Muennighoff committed
Commit 6603402 • 1 Parent(s): 862213b
Add files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +153 -0
- 4b284b12bc4/eval/merged.csv +587 -0
- 4b284b12bc4/eval/merged.json +0 -0
- 4b284b17bc4/eval/merged.csv +587 -0
- 4b284b17bc4/eval/merged.json +0 -0
- 4b284b21bc4/eval/merged.csv +587 -0
- 4b284b21bc4/eval/merged.json +0 -0
- 4b284b28bc4/eval/merged.csv +587 -0
- 4b284b28bc4/eval/merged.json +0 -0
- 4b284b42bc4/eval/merged.csv +587 -0
- 4b284b42bc4/eval/merged.json +0 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_5.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_5.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_5.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_5.json +1 -0
- 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_4.json +1 -0
.gitattributes
CHANGED
@@ -2939,3 +2939,156 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
2939 |
4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
|
2940 |
4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2941 |
4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
|
2942 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2943 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2944 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2945 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2946 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2947 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2948 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2949 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2950 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2951 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2952 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2953 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2954 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2955 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2956 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2957 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2958 |
+
4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2959 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2960 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2961 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2962 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2963 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2964 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2965 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2966 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2967 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2968 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2969 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2970 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2971 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2972 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2973 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2974 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2975 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2976 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2977 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2978 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2979 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2980 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2981 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2982 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2983 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2984 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2985 |
+
4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2986 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2987 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2988 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2989 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2990 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2991 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2992 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2993 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2994 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
2995 |
+
4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2996 |
+
4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2997 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2998 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
2999 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3000 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3001 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3002 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3003 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3004 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3005 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3006 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3007 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3008 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3009 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3010 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3011 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3012 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3013 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3014 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3015 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3016 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3017 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3018 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3019 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3020 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3021 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3022 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3023 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3024 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3025 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3026 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3027 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3028 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3029 |
+
4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3030 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3031 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3032 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3033 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3034 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3035 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3036 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3037 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3038 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3039 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3040 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3041 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3042 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3043 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3044 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3045 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3046 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3047 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3048 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3049 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3050 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3051 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3052 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3053 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3054 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3055 |
+
4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3056 |
+
4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3057 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3058 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3059 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3060 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3061 |
+
4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3062 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3063 |
+
4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3064 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3065 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3066 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3067 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3068 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3069 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3070 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3071 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3072 |
+
4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3073 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3074 |
+
4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3075 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3076 |
+
4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3077 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3078 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3079 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3080 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3081 |
+
4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3082 |
+
4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3083 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3084 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3085 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3086 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3087 |
+
4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3088 |
+
4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3089 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3090 |
+
4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3091 |
+
4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3092 |
+
4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text
|
3093 |
+
4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
3094 |
+
4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
|
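Every rule added above marks a per-prompt `examples.*.jsonl` evaluation dump as a Git LFS object, so the repository stores a pointer rather than the raw JSONL text. As a minimal sketch (assuming a local checkout; the helper below is illustrative and not the script used for this commit), rules of this form could be appended programmatically for one model directory:

```python
from pathlib import Path

# Illustrative helper (assumed layout, not part of this commit): append
# "filter=lfs diff=lfs merge=lfs -text" rules for every eval example dump
# of a given model directory to the repository's .gitattributes.
LFS_RULE = "{path} filter=lfs diff=lfs merge=lfs -text"

def track_eval_examples(repo_root: str, model_dir: str) -> None:
    root = Path(repo_root)
    gitattributes = root / ".gitattributes"
    existing = set(gitattributes.read_text().splitlines()) if gitattributes.exists() else set()
    rules = [
        LFS_RULE.format(path=p.relative_to(root).as_posix())
        for p in sorted((root / model_dir / "eval").glob("examples.*.jsonl"))
    ]
    new_rules = [r for r in rules if r not in existing]
    if new_rules:
        with gitattributes.open("a", encoding="utf-8") as f:
            f.write("\n".join(new_rules) + "\n")

# e.g. track_eval_examples(".", "4b284b84bc4")  # hypothetical usage
```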
4b284b12bc4/eval/merged.csv
ADDED
@@ -0,0 +1,587 @@
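The added file is a flat CSV with columns `dataset,fewshots,prompt,metric,value`. For each dataset and shot count it also carries a `median` row (the median of the per-prompt scores), and the 5-shot block ends with an `average` row (the mean of those per-shot medians). A minimal sketch, assuming pandas and a local checkout, of how those aggregate rows could be recomputed from the per-prompt rows:

```python
import pandas as pd

# Load the merged per-prompt results added in this commit.
df = pd.read_csv("4b284b12bc4/eval/merged.csv")

# Drop the precomputed aggregate rows; keep only per-prompt scores.
per_prompt = df[~df["prompt"].isin(["median", "average"])]

# Median across prompts for each dataset / fewshot setting
# (should match the 'median' rows in the file).
medians = per_prompt.groupby(["dataset", "fewshots"])["value"].median().reset_index()

# Mean of the per-shot medians for each dataset
# (should match the 'average' rows).
averages = medians.groupby("dataset")["value"].mean()

print(medians.head())
print(averages.head())
```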
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
anli_r1,0,GPT-3 style,acc,0.334
|
3 |
+
anli_r1,0,MNLI crowdsource,acc,0.334
|
4 |
+
anli_r1,0,can we infer,acc,0.336
|
5 |
+
anli_r1,0,guaranteed/possible/impossible,acc,0.323
|
6 |
+
anli_r1,0,justified in saying,acc,0.329
|
7 |
+
anli_r1,0,median,accuracy,0.334
|
8 |
+
anli_r1,1,GPT-3 style,acc,0.334
|
9 |
+
anli_r1,1,MNLI crowdsource,acc,0.333
|
10 |
+
anli_r1,1,can we infer,acc,0.325
|
11 |
+
anli_r1,1,guaranteed/possible/impossible,acc,0.33
|
12 |
+
anli_r1,1,justified in saying,acc,0.327
|
13 |
+
anli_r1,1,median,accuracy,0.33
|
14 |
+
anli_r1,2,GPT-3 style,acc,0.349
|
15 |
+
anli_r1,2,MNLI crowdsource,acc,0.361
|
16 |
+
anli_r1,2,can we infer,acc,0.352
|
17 |
+
anli_r1,2,guaranteed/possible/impossible,acc,0.323
|
18 |
+
anli_r1,2,justified in saying,acc,0.345
|
19 |
+
anli_r1,2,median,accuracy,0.349
|
20 |
+
anli_r1,3,GPT-3 style,acc,0.33
|
21 |
+
anli_r1,3,MNLI crowdsource,acc,0.335
|
22 |
+
anli_r1,3,can we infer,acc,0.345
|
23 |
+
anli_r1,3,guaranteed/possible/impossible,acc,0.32
|
24 |
+
anli_r1,3,justified in saying,acc,0.349
|
25 |
+
anli_r1,3,median,accuracy,0.335
|
26 |
+
anli_r1,4,GPT-3 style,acc,0.318
|
27 |
+
anli_r1,4,MNLI crowdsource,acc,0.332
|
28 |
+
anli_r1,4,can we infer,acc,0.327
|
29 |
+
anli_r1,4,guaranteed/possible/impossible,acc,0.309
|
30 |
+
anli_r1,4,justified in saying,acc,0.333
|
31 |
+
anli_r1,4,median,accuracy,0.327
|
32 |
+
anli_r1,5,GPT-3 style,acc,0.321
|
33 |
+
anli_r1,5,MNLI crowdsource,acc,0.343
|
34 |
+
anli_r1,5,can we infer,acc,0.315
|
35 |
+
anli_r1,5,guaranteed/possible/impossible,acc,0.33
|
36 |
+
anli_r1,5,justified in saying,acc,0.333
|
37 |
+
anli_r1,5,median,accuracy,0.33
|
38 |
+
anli_r1,5,average,multiple,0.33416666666666667
|
39 |
+
anli_r2,0,GPT-3 style,acc,0.336
|
40 |
+
anli_r2,0,MNLI crowdsource,acc,0.334
|
41 |
+
anli_r2,0,can we infer,acc,0.336
|
42 |
+
anli_r2,0,guaranteed/possible/impossible,acc,0.325
|
43 |
+
anli_r2,0,justified in saying,acc,0.319
|
44 |
+
anli_r2,0,median,accuracy,0.334
|
45 |
+
anli_r2,1,GPT-3 style,acc,0.305
|
46 |
+
anli_r2,1,MNLI crowdsource,acc,0.315
|
47 |
+
anli_r2,1,can we infer,acc,0.312
|
48 |
+
anli_r2,1,guaranteed/possible/impossible,acc,0.313
|
49 |
+
anli_r2,1,justified in saying,acc,0.314
|
50 |
+
anli_r2,1,median,accuracy,0.313
|
51 |
+
anli_r2,2,GPT-3 style,acc,0.305
|
52 |
+
anli_r2,2,MNLI crowdsource,acc,0.336
|
53 |
+
anli_r2,2,can we infer,acc,0.332
|
54 |
+
anli_r2,2,guaranteed/possible/impossible,acc,0.328
|
55 |
+
anli_r2,2,justified in saying,acc,0.335
|
56 |
+
anli_r2,2,median,accuracy,0.332
|
57 |
+
anli_r2,3,GPT-3 style,acc,0.317
|
58 |
+
anli_r2,3,MNLI crowdsource,acc,0.311
|
59 |
+
anli_r2,3,can we infer,acc,0.333
|
60 |
+
anli_r2,3,guaranteed/possible/impossible,acc,0.335
|
61 |
+
anli_r2,3,justified in saying,acc,0.339
|
62 |
+
anli_r2,3,median,accuracy,0.333
|
63 |
+
anli_r2,4,GPT-3 style,acc,0.313
|
64 |
+
anli_r2,4,MNLI crowdsource,acc,0.323
|
65 |
+
anli_r2,4,can we infer,acc,0.317
|
66 |
+
anli_r2,4,guaranteed/possible/impossible,acc,0.34
|
67 |
+
anli_r2,4,justified in saying,acc,0.319
|
68 |
+
anli_r2,4,median,accuracy,0.319
|
69 |
+
anli_r2,5,GPT-3 style,acc,0.324
|
70 |
+
anli_r2,5,MNLI crowdsource,acc,0.338
|
71 |
+
anli_r2,5,can we infer,acc,0.327
|
72 |
+
anli_r2,5,guaranteed/possible/impossible,acc,0.337
|
73 |
+
anli_r2,5,justified in saying,acc,0.315
|
74 |
+
anli_r2,5,median,accuracy,0.327
|
75 |
+
anli_r2,5,average,multiple,0.32633333333333336
|
76 |
+
anli_r3,0,GPT-3 style,acc,0.3383333333333333
|
77 |
+
anli_r3,0,MNLI crowdsource,acc,0.33666666666666667
|
78 |
+
anli_r3,0,can we infer,acc,0.33916666666666667
|
79 |
+
anli_r3,0,guaranteed/possible/impossible,acc,0.2991666666666667
|
80 |
+
anli_r3,0,justified in saying,acc,0.3433333333333333
|
81 |
+
anli_r3,0,median,accuracy,0.3383333333333333
|
82 |
+
anli_r3,1,GPT-3 style,acc,0.3325
|
83 |
+
anli_r3,1,MNLI crowdsource,acc,0.3358333333333333
|
84 |
+
anli_r3,1,can we infer,acc,0.3408333333333333
|
85 |
+
anli_r3,1,guaranteed/possible/impossible,acc,0.33666666666666667
|
86 |
+
anli_r3,1,justified in saying,acc,0.33916666666666667
|
87 |
+
anli_r3,1,median,accuracy,0.33666666666666667
|
88 |
+
anli_r3,2,GPT-3 style,acc,0.32416666666666666
|
89 |
+
anli_r3,2,MNLI crowdsource,acc,0.32
|
90 |
+
anli_r3,2,can we infer,acc,0.31166666666666665
|
91 |
+
anli_r3,2,guaranteed/possible/impossible,acc,0.305
|
92 |
+
anli_r3,2,justified in saying,acc,0.30416666666666664
|
93 |
+
anli_r3,2,median,accuracy,0.31166666666666665
|
94 |
+
anli_r3,3,GPT-3 style,acc,0.3408333333333333
|
95 |
+
anli_r3,3,MNLI crowdsource,acc,0.35
|
96 |
+
anli_r3,3,can we infer,acc,0.3333333333333333
|
97 |
+
anli_r3,3,guaranteed/possible/impossible,acc,0.31916666666666665
|
98 |
+
anli_r3,3,justified in saying,acc,0.3441666666666667
|
99 |
+
anli_r3,3,median,accuracy,0.3408333333333333
|
100 |
+
anli_r3,4,GPT-3 style,acc,0.33166666666666667
|
101 |
+
anli_r3,4,MNLI crowdsource,acc,0.3275
|
102 |
+
anli_r3,4,can we infer,acc,0.3383333333333333
|
103 |
+
anli_r3,4,guaranteed/possible/impossible,acc,0.3375
|
104 |
+
anli_r3,4,justified in saying,acc,0.3358333333333333
|
105 |
+
anli_r3,4,median,accuracy,0.3358333333333333
|
106 |
+
anli_r3,5,GPT-3 style,acc,0.32166666666666666
|
107 |
+
anli_r3,5,MNLI crowdsource,acc,0.32
|
108 |
+
anli_r3,5,can we infer,acc,0.33666666666666667
|
109 |
+
anli_r3,5,guaranteed/possible/impossible,acc,0.32666666666666666
|
110 |
+
anli_r3,5,justified in saying,acc,0.32416666666666666
|
111 |
+
anli_r3,5,median,accuracy,0.32416666666666666
|
112 |
+
anli_r3,5,average,multiple,0.33125
|
113 |
+
arc_easy,0,heres_a_problem,acc,0.23890784982935154
|
114 |
+
arc_easy,0,i_am_hesitating,acc,0.3042929292929293
|
115 |
+
arc_easy,0,multiple_choice,acc,0.25715488215488214
|
116 |
+
arc_easy,0,pick_the_most_correct_option,acc,0.22866894197952217
|
117 |
+
arc_easy,0,qa_options,acc,0.2525597269624573
|
118 |
+
arc_easy,0,median,accuracy,0.2525597269624573
|
119 |
+
arc_easy,1,heres_a_problem,acc,0.2398989898989899
|
120 |
+
arc_easy,1,i_am_hesitating,acc,0.2627986348122867
|
121 |
+
arc_easy,1,multiple_choice,acc,0.2836700336700337
|
122 |
+
arc_easy,1,pick_the_most_correct_option,acc,0.23122866894197952
|
123 |
+
arc_easy,1,qa_options,acc,0.25426621160409557
|
124 |
+
arc_easy,1,median,accuracy,0.25426621160409557
|
125 |
+
arc_easy,2,heres_a_problem,acc,0.24494949494949494
|
126 |
+
arc_easy,2,i_am_hesitating,acc,0.2946127946127946
|
127 |
+
arc_easy,2,multiple_choice,acc,0.23293515358361774
|
128 |
+
arc_easy,2,pick_the_most_correct_option,acc,0.2354948805460751
|
129 |
+
arc_easy,2,qa_options,acc,0.31523569023569026
|
130 |
+
arc_easy,2,median,accuracy,0.24494949494949494
|
131 |
+
arc_easy,3,heres_a_problem,acc,0.25336700336700335
|
132 |
+
arc_easy,3,i_am_hesitating,acc,0.26791808873720135
|
133 |
+
arc_easy,3,multiple_choice,acc,0.2431740614334471
|
134 |
+
arc_easy,3,pick_the_most_correct_option,acc,0.24061433447098976
|
135 |
+
arc_easy,3,qa_options,acc,0.31734006734006737
|
136 |
+
arc_easy,3,median,accuracy,0.25336700336700335
|
137 |
+
arc_easy,4,heres_a_problem,acc,0.2380546075085324
|
138 |
+
arc_easy,4,i_am_hesitating,acc,0.29713804713804715
|
139 |
+
arc_easy,4,multiple_choice,acc,0.2908249158249158
|
140 |
+
arc_easy,4,pick_the_most_correct_option,acc,0.2361111111111111
|
141 |
+
arc_easy,4,qa_options,acc,0.26791808873720135
|
142 |
+
arc_easy,4,median,accuracy,0.26791808873720135
|
143 |
+
arc_easy,5,heres_a_problem,acc,0.2226962457337884
|
144 |
+
arc_easy,5,i_am_hesitating,acc,0.30303030303030304
|
145 |
+
arc_easy,5,multiple_choice,acc,0.2967171717171717
|
146 |
+
arc_easy,5,pick_the_most_correct_option,acc,0.24957912457912457
|
147 |
+
arc_easy,5,qa_options,acc,0.2619453924914676
|
148 |
+
arc_easy,5,median,accuracy,0.2619453924914676
|
149 |
+
arc_easy,5,average,multiple,0.2558343196852867
|
150 |
+
boolq,0,GPT-3 Style,acc,0.6163333333333333
|
151 |
+
boolq,0,after_reading,acc,0.622
|
152 |
+
boolq,0,exercise,acc,0.6236666666666667
|
153 |
+
boolq,0,valid_binary,acc,0.565
|
154 |
+
boolq,0,yes_no_question,acc,0.5426666666666666
|
155 |
+
boolq,0,median,accuracy,0.6163333333333333
|
156 |
+
boolq,1,GPT-3 Style,acc,0.596
|
157 |
+
boolq,1,after_reading,acc,0.546
|
158 |
+
boolq,1,exercise,acc,0.5566666666666666
|
159 |
+
boolq,1,valid_binary,acc,0.5693333333333334
|
160 |
+
boolq,1,yes_no_question,acc,0.5436666666666666
|
161 |
+
boolq,1,median,accuracy,0.5566666666666666
|
162 |
+
boolq,2,GPT-3 Style,acc,0.5923333333333334
|
163 |
+
boolq,2,after_reading,acc,0.5926666666666667
|
164 |
+
boolq,2,exercise,acc,0.576
|
165 |
+
boolq,2,valid_binary,acc,0.5973333333333334
|
166 |
+
boolq,2,yes_no_question,acc,0.562
|
167 |
+
boolq,2,median,accuracy,0.5923333333333334
|
168 |
+
boolq,3,GPT-3 Style,acc,0.6083333333333333
|
169 |
+
boolq,3,after_reading,acc,0.58
|
170 |
+
boolq,3,exercise,acc,0.5796666666666667
|
171 |
+
boolq,3,valid_binary,acc,0.5966666666666667
|
172 |
+
boolq,3,yes_no_question,acc,0.5646666666666667
|
173 |
+
boolq,3,median,accuracy,0.58
|
174 |
+
boolq,4,GPT-3 Style,acc,0.6136666666666667
|
175 |
+
boolq,4,after_reading,acc,0.5633333333333334
|
176 |
+
boolq,4,exercise,acc,0.593
|
177 |
+
boolq,4,valid_binary,acc,0.5913333333333334
|
178 |
+
boolq,4,yes_no_question,acc,0.5516666666666666
|
179 |
+
boolq,4,median,accuracy,0.5913333333333334
|
180 |
+
boolq,5,GPT-3 Style,acc,0.609
|
181 |
+
boolq,5,after_reading,acc,0.5546666666666666
|
182 |
+
boolq,5,exercise,acc,0.5896666666666667
|
183 |
+
boolq,5,valid_binary,acc,0.583
|
184 |
+
boolq,5,yes_no_question,acc,0.5483333333333333
|
185 |
+
boolq,5,median,accuracy,0.583
|
186 |
+
boolq,5,average,multiple,0.5866111111111111
|
187 |
+
cb,0,GPT-3 style,acc,0.4107142857142857
|
188 |
+
cb,0,MNLI crowdsource,acc,0.4107142857142857
|
189 |
+
cb,0,can we infer,acc,0.2857142857142857
|
190 |
+
cb,0,guaranteed/possible/impossible,acc,0.42857142857142855
|
191 |
+
cb,0,justified in saying,acc,0.19642857142857142
|
192 |
+
cb,0,median,accuracy,0.4107142857142857
|
193 |
+
cb,1,GPT-3 style,acc,0.39285714285714285
|
194 |
+
cb,1,MNLI crowdsource,acc,0.39285714285714285
|
195 |
+
cb,1,can we infer,acc,0.39285714285714285
|
196 |
+
cb,1,guaranteed/possible/impossible,acc,0.39285714285714285
|
197 |
+
cb,1,justified in saying,acc,0.44642857142857145
|
198 |
+
cb,1,median,accuracy,0.39285714285714285
|
199 |
+
cb,2,GPT-3 style,acc,0.42857142857142855
|
200 |
+
cb,2,MNLI crowdsource,acc,0.44642857142857145
|
201 |
+
cb,2,can we infer,acc,0.42857142857142855
|
202 |
+
cb,2,guaranteed/possible/impossible,acc,0.44642857142857145
|
203 |
+
cb,2,justified in saying,acc,0.42857142857142855
|
204 |
+
cb,2,median,accuracy,0.42857142857142855
|
205 |
+
cb,3,GPT-3 style,acc,0.39285714285714285
|
206 |
+
cb,3,MNLI crowdsource,acc,0.3392857142857143
|
207 |
+
cb,3,can we infer,acc,0.44642857142857145
|
208 |
+
cb,3,guaranteed/possible/impossible,acc,0.375
|
209 |
+
cb,3,justified in saying,acc,0.375
|
210 |
+
cb,3,median,accuracy,0.375
|
211 |
+
cb,4,GPT-3 style,acc,0.4107142857142857
|
212 |
+
cb,4,MNLI crowdsource,acc,0.39285714285714285
|
213 |
+
cb,4,can we infer,acc,0.42857142857142855
|
214 |
+
cb,4,guaranteed/possible/impossible,acc,0.5357142857142857
|
215 |
+
cb,4,justified in saying,acc,0.44642857142857145
|
216 |
+
cb,4,median,accuracy,0.42857142857142855
|
217 |
+
cb,5,GPT-3 style,acc,0.48214285714285715
|
218 |
+
cb,5,MNLI crowdsource,acc,0.4107142857142857
|
219 |
+
cb,5,can we infer,acc,0.375
|
220 |
+
cb,5,guaranteed/possible/impossible,acc,0.375
|
221 |
+
cb,5,justified in saying,acc,0.39285714285714285
|
222 |
+
cb,5,median,accuracy,0.39285714285714285
|
223 |
+
cb,5,average,multiple,0.40476190476190477
|
224 |
+
copa,0,best_option,acc,0.53
|
225 |
+
copa,0,cause_effect,acc,0.52
|
226 |
+
copa,0,choose,acc,0.49
|
227 |
+
copa,0,i_am_hesitating,acc,0.54
|
228 |
+
copa,0,plausible_alternatives,acc,0.53
|
229 |
+
copa,0,median,accuracy,0.53
|
230 |
+
copa,1,best_option,acc,0.59
|
231 |
+
copa,1,cause_effect,acc,0.46
|
232 |
+
copa,1,choose,acc,0.45
|
233 |
+
copa,1,i_am_hesitating,acc,0.45
|
234 |
+
copa,1,plausible_alternatives,acc,0.46
|
235 |
+
copa,1,median,accuracy,0.46
|
236 |
+
copa,2,best_option,acc,0.51
|
237 |
+
copa,2,cause_effect,acc,0.45
|
238 |
+
copa,2,choose,acc,0.45
|
239 |
+
copa,2,i_am_hesitating,acc,0.49
|
240 |
+
copa,2,plausible_alternatives,acc,0.46
|
241 |
+
copa,2,median,accuracy,0.46
|
242 |
+
copa,3,best_option,acc,0.55
|
243 |
+
copa,3,cause_effect,acc,0.47
|
244 |
+
copa,3,choose,acc,0.49
|
245 |
+
copa,3,i_am_hesitating,acc,0.48
|
246 |
+
copa,3,plausible_alternatives,acc,0.49
|
247 |
+
copa,3,median,accuracy,0.49
|
248 |
+
copa,4,best_option,acc,0.49
|
249 |
+
copa,4,cause_effect,acc,0.48
|
250 |
+
copa,4,choose,acc,0.51
|
251 |
+
copa,4,i_am_hesitating,acc,0.51
|
252 |
+
copa,4,plausible_alternatives,acc,0.48
|
253 |
+
copa,4,median,accuracy,0.49
|
254 |
+
copa,5,best_option,acc,0.54
|
255 |
+
copa,5,cause_effect,acc,0.51
|
256 |
+
copa,5,choose,acc,0.46
|
257 |
+
copa,5,i_am_hesitating,acc,0.51
|
258 |
+
copa,5,plausible_alternatives,acc,0.5
|
259 |
+
copa,5,median,accuracy,0.51
|
260 |
+
copa,5,average,multiple,0.49
|
261 |
+
e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.15645061177192066
|
262 |
+
e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.06347842363431547
|
263 |
+
e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.00012067093428409366
|
264 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00024104025657346095
|
265 |
+
e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.10910465326076894
|
266 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.06347842363431547
|
267 |
+
e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1870937559813721
|
268 |
+
e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16511209673657395
|
269 |
+
e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.025195913355673966
|
270 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1714205638298909
|
271 |
+
e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.20219167803744306
|
272 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1714205638298909
|
273 |
+
e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.18600518275150685
|
274 |
+
e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.17074360575215342
|
275 |
+
e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.04447784117945149
|
276 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19259169221915515
|
277 |
+
e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.19722529213201134
|
278 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18600518275150685
|
279 |
+
e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.18307097946148873
|
280 |
+
e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.17213478001357976
|
281 |
+
e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.038284747118588126
|
282 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19636018570824587
|
283 |
+
e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.1964954395976402
|
284 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18307097946148873
|
285 |
+
e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.19134136835621748
|
286 |
+
e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.17010384910521295
|
287 |
+
e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.037516989850184534
|
288 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.19590832872090894
|
289 |
+
e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19536984000862256
|
290 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.19134136835621748
|
291 |
+
e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.18872128486346074
|
292 |
+
e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.1683711858028947
|
293 |
+
e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.038242180726931196
|
294 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19402158147865167
|
295 |
+
e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.19119099944111612
|
296 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.18872128486346074
|
297 |
+
e2e_nlg_cleaned,5,average,multiple,0.16400630048281337
|
298 |
+
gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.014155568509608755
|
299 |
+
gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.005848067139995684
|
300 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.01730052045113504
|
301 |
+
gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.031013676801335422
|
302 |
+
gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.040900489822348056
|
303 |
+
gem_xsum,0,median,rouge2_fmeasure,0.01730052045113504
|
304 |
+
gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.020262527556005907
|
305 |
+
gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.012072025290438592
|
306 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.019132118327200527
|
307 |
+
gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04334620232538617
|
308 |
+
gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.038774277981477374
|
309 |
+
gem_xsum,1,median,rouge2_fmeasure,0.020262527556005907
|
310 |
+
gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.02824595859604695
|
311 |
+
gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.02751335673945438
|
312 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.026545543337132424
|
313 |
+
gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.04362070001507444
|
314 |
+
gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.03664914264570665
|
315 |
+
gem_xsum,2,median,rouge2_fmeasure,0.02824595859604695
|
316 |
+
gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.02800561543388405
|
317 |
+
gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.0402095932041227
|
318 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03291830334125208
|
319 |
+
gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.0400453211123096
|
320 |
+
gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.03701973106444136
|
321 |
+
gem_xsum,3,median,rouge2_fmeasure,0.03701973106444136
|
322 |
+
gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.00666835063292078
|
323 |
+
gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.010845224152235416
|
324 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010104068388385765
|
325 |
+
gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.010522073701869125
|
326 |
+
gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.008786196844590121
|
327 |
+
gem_xsum,4,median,rouge2_fmeasure,0.010104068388385765
|
328 |
+
gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
|
329 |
+
gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.0003107051777238192
|
330 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00041371259854665804
|
331 |
+
gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.00046275158053195667
|
332 |
+
gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,6.352836541515787e-05
|
333 |
+
gem_xsum,5,median,rouge2_fmeasure,0.0003107051777238192
|
334 |
+
gem_xsum,5,average,multiple,0.018873918538956473
|
335 |
+
piqa,0,Correct the solution,rouge2_fmeasure,0.09706102035374112
|
336 |
+
piqa,0,choose the most appropriate solution,acc,0.49510337323177367
|
337 |
+
piqa,0,no prompt needed,rouge2_fmeasure,0.005928136888518339
|
338 |
+
piqa,0,pick_correct_choice_index,acc,0.49510337323177367
|
339 |
+
piqa,0,what_is_the_correct_ending,acc,0.5565832426550599
|
340 |
+
piqa,0,median,accuracy,0.49510337323177367
|
341 |
+
piqa,1,Correct the solution,rouge2_fmeasure,0.16839814753926893
|
342 |
+
piqa,1,choose the most appropriate solution,acc,0.5087051142546246
|
343 |
+
piqa,1,no prompt needed,rouge2_fmeasure,0.005682715949708656
|
344 |
+
piqa,1,pick_correct_choice_index,acc,0.5076169749727966
|
345 |
+
piqa,1,what_is_the_correct_ending,acc,0.5685527747551686
|
346 |
+
piqa,1,median,accuracy,0.5087051142546246
|
347 |
+
piqa,2,Correct the solution,rouge2_fmeasure,0.21700191007059494
|
348 |
+
piqa,2,choose the most appropriate solution,acc,0.5223068552774756
|
349 |
+
piqa,2,no prompt needed,rouge2_fmeasure,0.005621916396892083
|
350 |
+
piqa,2,pick_correct_choice_index,acc,0.5
|
351 |
+
piqa,2,what_is_the_correct_ending,acc,0.5718171926006529
|
352 |
+
piqa,2,median,accuracy,0.5223068552774756
|
353 |
+
piqa,3,Correct the solution,rouge2_fmeasure,0.2220313726729203
|
354 |
+
piqa,3,choose the most appropriate solution,acc,0.5092491838955386
|
355 |
+
piqa,3,no prompt needed,rouge2_fmeasure,0.005486989401606149
|
356 |
+
piqa,3,pick_correct_choice_index,acc,0.515778019586507
|
357 |
+
piqa,3,what_is_the_correct_ending,acc,0.5663764961915125
|
358 |
+
piqa,3,median,accuracy,0.515778019586507
|
359 |
+
piqa,4,Correct the solution,rouge2_fmeasure,0.21583669822052345
|
360 |
+
piqa,4,choose the most appropriate solution,acc,0.5282916213275299
|
361 |
+
piqa,4,no prompt needed,rouge2_fmeasure,0.005250361302057742
|
362 |
+
piqa,4,pick_correct_choice_index,acc,0.5228509249183896
|
363 |
+
piqa,4,what_is_the_correct_ending,acc,0.5865070729053319
|
364 |
+
piqa,4,median,accuracy,0.5282916213275299
|
365 |
+
piqa,5,Correct the solution,rouge2_fmeasure,0.20868674330105244
|
366 |
+
piqa,5,choose the most appropriate solution,acc,0.5114254624591947
|
367 |
+
piqa,5,no prompt needed,rouge2_fmeasure,0.005515135528910162
|
368 |
+
piqa,5,pick_correct_choice_index,acc,0.5021762785636561
|
369 |
+
piqa,5,what_is_the_correct_ending,acc,0.5848748639825898
|
370 |
+
piqa,5,median,accuracy,0.5114254624591947
|
371 |
+
piqa,5,average,multiple,0.5136017410228509
|
372 |
+
sciq,0,Direct Question,acc,0.83
|
373 |
+
sciq,0,Direct Question (Closed Book),acc,0.613
|
374 |
+
sciq,0,Multiple Choice,acc,0.342
|
375 |
+
sciq,0,Multiple Choice (Closed Book),acc,0.287
|
376 |
+
sciq,0,Multiple Choice Question First,acc,0.349
|
377 |
+
sciq,0,median,accuracy,0.349
|
378 |
+
sciq,1,Direct Question,acc,0.846
|
379 |
+
sciq,1,Direct Question (Closed Book),acc,0.663
|
380 |
+
sciq,1,Multiple Choice,acc,0.378
|
381 |
+
sciq,1,Multiple Choice (Closed Book),acc,0.378
|
382 |
+
sciq,1,Multiple Choice Question First,acc,0.392
|
383 |
+
sciq,1,median,accuracy,0.392
|
384 |
+
sciq,2,Direct Question,acc,0.853
|
385 |
+
sciq,2,Direct Question (Closed Book),acc,0.673
|
386 |
+
sciq,2,Multiple Choice,acc,0.344
|
387 |
+
sciq,2,Multiple Choice (Closed Book),acc,0.372
|
388 |
+
sciq,2,Multiple Choice Question First,acc,0.363
|
389 |
+
sciq,2,median,accuracy,0.372
|
390 |
+
sciq,3,Direct Question,acc,0.856
|
391 |
+
sciq,3,Direct Question (Closed Book),acc,0.662
|
392 |
+
sciq,3,Multiple Choice,acc,0.329
|
393 |
+
sciq,3,Multiple Choice (Closed Book),acc,0.349
|
394 |
+
sciq,3,Multiple Choice Question First,acc,0.363
|
395 |
+
sciq,3,median,accuracy,0.363
|
396 |
+
sciq,4,Direct Question,acc,0.849
|
397 |
+
sciq,4,Direct Question (Closed Book),acc,0.671
|
398 |
+
sciq,4,Multiple Choice,acc,0.335
|
399 |
+
sciq,4,Multiple Choice (Closed Book),acc,0.335
|
400 |
+
sciq,4,Multiple Choice Question First,acc,0.319
|
401 |
+
sciq,4,median,accuracy,0.335
|
402 |
+
sciq,5,Direct Question,acc,0.849
|
403 |
+
sciq,5,Direct Question (Closed Book),acc,0.682
|
404 |
+
sciq,5,Multiple Choice,acc,0.327
|
405 |
+
sciq,5,Multiple Choice (Closed Book),acc,0.362
|
406 |
+
sciq,5,Multiple Choice Question First,acc,0.333
|
407 |
+
sciq,5,median,accuracy,0.362
|
408 |
+
sciq,5,average,multiple,0.36216666666666664
|
409 |
+
story_cloze_2016,0,Answer Given options,acc,0.4719401389631213
|
410 |
+
story_cloze_2016,0,Choose Story Ending,acc,0.4906467129877071
|
411 |
+
story_cloze_2016,0,Novel Correct Ending,acc,0.4831640833778728
|
412 |
+
story_cloze_2016,0,Story Continuation and Options,acc,0.49706039551042225
|
413 |
+
story_cloze_2016,0,median,accuracy,0.48690539818279
|
414 |
+
story_cloze_2016,1,Answer Given options,acc,0.4521646178514164
|
415 |
+
story_cloze_2016,1,Choose Story Ending,acc,0.4596472474612507
|
416 |
+
story_cloze_2016,1,Novel Correct Ending,acc,0.4494922501336184
|
417 |
+
story_cloze_2016,1,Story Continuation and Options,acc,0.46392303580972744
|
418 |
+
story_cloze_2016,1,median,accuracy,0.4559059326563335
|
419 |
+
story_cloze_2016,2,Answer Given options,acc,0.4510956707642972
|
420 |
+
story_cloze_2016,2,Choose Story Ending,acc,0.4623196151790486
|
421 |
+
story_cloze_2016,2,Novel Correct Ending,acc,0.4478888295029396
|
422 |
+
story_cloze_2016,2,Story Continuation and Options,acc,0.45911277391769106
|
423 |
+
story_cloze_2016,2,median,accuracy,0.4551042223409941
|
424 |
+
story_cloze_2016,3,Answer Given options,acc,0.4665954035275254
|
425 |
+
story_cloze_2016,3,Choose Story Ending,acc,0.45269909139497594
|
426 |
+
story_cloze_2016,3,Novel Correct Ending,acc,0.4494922501336184
|
427 |
+
story_cloze_2016,3,Story Continuation and Options,acc,0.4521646178514164
|
428 |
+
story_cloze_2016,3,median,accuracy,0.45243185462319613
|
429 |
+
story_cloze_2016,4,Answer Given options,acc,0.45537145911277394
|
430 |
+
story_cloze_2016,4,Choose Story Ending,acc,0.46125066809192944
|
431 |
+
story_cloze_2016,4,Novel Correct Ending,acc,0.44200962052378406
|
432 |
+
story_cloze_2016,4,Story Continuation and Options,acc,0.4510956707642972
|
433 |
+
story_cloze_2016,4,median,accuracy,0.45323356493853556
|
434 |
+
story_cloze_2016,5,Answer Given options,acc,0.4665954035275254
|
435 |
+
story_cloze_2016,5,Choose Story Ending,acc,0.467129877071085
|
436 |
+
story_cloze_2016,5,Novel Correct Ending,acc,0.45056119722073756
|
437 |
+
story_cloze_2016,5,Story Continuation and Options,acc,0.4665954035275254
|
438 |
+
story_cloze_2016,5,median,accuracy,0.4665954035275254
|
439 |
+
story_cloze_2016,5,average,multiple,0.46169606271156244
|
440 |
+
superglue_rte,0,GPT-3 style,acc,0.5270758122743683
|
441 |
+
superglue_rte,0,MNLI crowdsource,acc,0.5342960288808665
|
442 |
+
superglue_rte,0,does it follow that,acc,0.5270758122743683
|
443 |
+
superglue_rte,0,guaranteed true,acc,0.5054151624548736
|
444 |
+
superglue_rte,0,should assume,acc,0.5415162454873647
|
445 |
+
superglue_rte,0,median,accuracy,0.5270758122743683
|
446 |
+
superglue_rte,1,GPT-3 style,acc,0.4729241877256318
|
447 |
+
superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
|
448 |
+
superglue_rte,1,does it follow that,acc,0.49097472924187724
|
449 |
+
superglue_rte,1,guaranteed true,acc,0.49097472924187724
|
450 |
+
superglue_rte,1,should assume,acc,0.49097472924187724
|
451 |
+
superglue_rte,1,median,accuracy,0.49097472924187724
|
452 |
+
superglue_rte,2,GPT-3 style,acc,0.51985559566787
|
453 |
+
superglue_rte,2,MNLI crowdsource,acc,0.51985559566787
|
454 |
+
superglue_rte,2,does it follow that,acc,0.5090252707581228
|
455 |
+
superglue_rte,2,guaranteed true,acc,0.5270758122743683
|
456 |
+
superglue_rte,2,should assume,acc,0.5090252707581228
|
457 |
+
superglue_rte,2,median,accuracy,0.51985559566787
|
458 |
+
superglue_rte,3,GPT-3 style,acc,0.5090252707581228
|
459 |
+
superglue_rte,3,MNLI crowdsource,acc,0.49097472924187724
|
460 |
+
superglue_rte,3,does it follow that,acc,0.48375451263537905
|
461 |
+
superglue_rte,3,guaranteed true,acc,0.516245487364621
|
462 |
+
superglue_rte,3,should assume,acc,0.5018050541516246
|
463 |
+
superglue_rte,3,median,accuracy,0.5018050541516246
|
464 |
+
superglue_rte,4,GPT-3 style,acc,0.4620938628158845
|
465 |
+
superglue_rte,4,MNLI crowdsource,acc,0.48736462093862815
|
466 |
+
superglue_rte,4,does it follow that,acc,0.48014440433212996
|
467 |
+
superglue_rte,4,guaranteed true,acc,0.5090252707581228
|
468 |
+
superglue_rte,4,should assume,acc,0.48014440433212996
|
469 |
+
superglue_rte,4,median,accuracy,0.48014440433212996
|
470 |
+
superglue_rte,5,GPT-3 style,acc,0.4548736462093863
|
471 |
+
superglue_rte,5,MNLI crowdsource,acc,0.4693140794223827
|
472 |
+
superglue_rte,5,does it follow that,acc,0.4981949458483754
|
473 |
+
superglue_rte,5,guaranteed true,acc,0.4693140794223827
|
474 |
+
superglue_rte,5,should assume,acc,0.4729241877256318
|
475 |
+
superglue_rte,5,median,accuracy,0.4693140794223827
|
476 |
+
superglue_rte,5,average,multiple,0.4981949458483754
|
477 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0532813862747049
|
478 |
+
web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.012985384177633208
|
479 |
+
web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.0019179536475281184
|
480 |
+
web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.004150191718708099
|
481 |
+
web_nlg_en,0,very-explicit-description,rouge2_fmeasure,6.345797512857661e-05
|
482 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.004150191718708099
|
483 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05368591058094131
|
484 |
+
web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.04640292562526275
|
485 |
+
web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.025426464984268635
|
486 |
+
web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.054485621293343514
|
487 |
+
web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.051815361827752766
|
488 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.051815361827752766
|
489 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05344291957030947
|
490 |
+
web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.07757017845101091
|
491 |
+
web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.032591510976486694
|
492 |
+
web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.061645677874947354
|
493 |
+
web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.0538074768484528
|
494 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.0538074768484528
|
495 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05368996382308088
|
496 |
+
web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.06808437331115559
|
497 |
+
web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.036709719893509046
|
498 |
+
web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.06571024213935271
|
499 |
+
web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.0579898457998029
|
500 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.0579898457998029
|
501 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0515680827205002
|
502 |
+
web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.052457179235399276
|
503 |
+
web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.036402813498665906
|
504 |
+
web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.06374220282296517
|
505 |
+
web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.05793797811823835
|
506 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.052457179235399276
|
507 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05107734688924233
|
508 |
+
web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.045075512409701604
|
509 |
+
web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.03482971179628577
|
510 |
+
web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.0639493149144395
|
511 |
+
web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.05600461944766409
|
512 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.05107734688924233
|
513 |
+
web_nlg_en,5,average,multiple,0.04521623371989303
|
514 |
+
wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.009594517812957653
|
515 |
+
wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.003243321779952968
|
516 |
+
wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.0042667329498244436
|
517 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.002874313185982406
|
518 |
+
wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.0011035986294212138
|
519 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.003243321779952968
|
520 |
+
wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.017846850141455827
|
521 |
+
wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.010181112623842817
|
522 |
+
wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.00423567615381497
|
523 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.028190707681194575
|
524 |
+
wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.010646298254836605
|
525 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.010646298254836605
|
526 |
+
wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.022535640055881916
|
527 |
+
wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.02117387153026309
|
528 |
+
wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.004697153886380661
|
529 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04456119604899187
|
530 |
+
wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.02129586388647884
|
531 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.02129586388647884
|
532 |
+
wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.021648290209856712
|
533 |
+
wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.022197025590925616
|
534 |
+
wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.0040651974203171634
|
535 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03887583188926559
|
536 |
+
wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.021617853652187335
|
537 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.021648290209856712
|
538 |
+
wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.007536808369783641
|
539 |
+
wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.008171833643724272
|
540 |
+
wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.0018254913452193152
|
541 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013407675922368708
|
542 |
+
wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.00641884861944969
|
543 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.007536808369783641
|
544 |
+
wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.0011309269927620334
|
545 |
+
wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0012168332924537228
|
546 |
+
wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.00026468573039586365
|
547 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020845828252393957
|
548 |
+
wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.0006554801175224404
|
549 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0011309269927620334
|
550 |
+
wiki_lingua_en,5,average,multiple,0.010916918248945133
|
551 |
+
winogrande,0,Replace,acc,0.5059194948697711
|
552 |
+
winogrande,0,True or False,acc,0.494869771112865
|
553 |
+
winogrande,0,does underscore refer to,acc,0.4964483030781373
|
554 |
+
winogrande,0,stand for,acc,0.5098658247829518
|
555 |
+
winogrande,0,underscore refer to,acc,0.5177584846093133
|
556 |
+
winogrande,0,median,accuracy,0.5059194948697711
|
557 |
+
winogrande,1,Replace,acc,0.5114443567482242
|
558 |
+
winogrande,1,True or False,acc,0.494869771112865
|
559 |
+
winogrande,1,does underscore refer to,acc,0.49329123914759276
|
560 |
+
winogrande,1,stand for,acc,0.5090765588003157
|
561 |
+
winogrande,1,underscore refer to,acc,0.4964483030781373
|
562 |
+
winogrande,1,median,accuracy,0.4964483030781373
|
563 |
+
winogrande,2,Replace,acc,0.5043409629044988
|
564 |
+
winogrande,2,True or False,acc,0.49329123914759276
|
565 |
+
winogrande,2,does underscore refer to,acc,0.49171270718232046
|
566 |
+
winogrande,2,stand for,acc,0.49329123914759276
|
567 |
+
winogrande,2,underscore refer to,acc,0.5019731649565904
|
568 |
+
winogrande,2,median,accuracy,0.49329123914759276
|
569 |
+
winogrande,3,Replace,acc,0.5059194948697711
|
570 |
+
winogrande,3,True or False,acc,0.4988161010260458
|
571 |
+
winogrande,3,does underscore refer to,acc,0.48855564325177586
|
572 |
+
winogrande,3,stand for,acc,0.4980268350434096
|
573 |
+
winogrande,3,underscore refer to,acc,0.5209155485398579
|
574 |
+
winogrande,3,median,accuracy,0.4988161010260458
|
575 |
+
winogrande,4,Replace,acc,0.5019731649565904
|
576 |
+
winogrande,4,True or False,acc,0.5098658247829518
|
577 |
+
winogrande,4,does underscore refer to,acc,0.4877663772691397
|
578 |
+
winogrande,4,stand for,acc,0.4980268350434096
|
579 |
+
winogrande,4,underscore refer to,acc,0.5193370165745856
|
580 |
+
winogrande,4,median,accuracy,0.5019731649565904
|
581 |
+
winogrande,5,Replace,acc,0.4956590370955012
|
582 |
+
winogrande,5,True or False,acc,0.5019731649565904
|
583 |
+
winogrande,5,does underscore refer to,acc,0.4925019731649566
|
584 |
+
winogrande,5,stand for,acc,0.489344909234412
|
585 |
+
winogrande,5,underscore refer to,acc,0.5090765588003157
|
586 |
+
winogrande,5,median,accuracy,0.4956590370955012
|
587 |
+
winogrande,5,average,multiple,0.4986845566956064
|
4b284b12bc4/eval/merged.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b284b17bc4/eval/merged.csv
ADDED
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
anli_r1,0,GPT-3 style,acc,0.334
|
3 |
+
anli_r1,0,MNLI crowdsource,acc,0.337
|
4 |
+
anli_r1,0,can we infer,acc,0.335
|
5 |
+
anli_r1,0,guaranteed/possible/impossible,acc,0.349
|
6 |
+
anli_r1,0,justified in saying,acc,0.339
|
7 |
+
anli_r1,0,median,accuracy,0.337
|
8 |
+
anli_r1,1,GPT-3 style,acc,0.324
|
9 |
+
anli_r1,1,MNLI crowdsource,acc,0.332
|
10 |
+
anli_r1,1,can we infer,acc,0.333
|
11 |
+
anli_r1,1,guaranteed/possible/impossible,acc,0.333
|
12 |
+
anli_r1,1,justified in saying,acc,0.332
|
13 |
+
anli_r1,1,median,accuracy,0.332
|
14 |
+
anli_r1,2,GPT-3 style,acc,0.352
|
15 |
+
anli_r1,2,MNLI crowdsource,acc,0.337
|
16 |
+
anli_r1,2,can we infer,acc,0.345
|
17 |
+
anli_r1,2,guaranteed/possible/impossible,acc,0.33
|
18 |
+
anli_r1,2,justified in saying,acc,0.356
|
19 |
+
anli_r1,2,median,accuracy,0.345
|
20 |
+
anli_r1,3,GPT-3 style,acc,0.343
|
21 |
+
anli_r1,3,MNLI crowdsource,acc,0.352
|
22 |
+
anli_r1,3,can we infer,acc,0.359
|
23 |
+
anli_r1,3,guaranteed/possible/impossible,acc,0.325
|
24 |
+
anli_r1,3,justified in saying,acc,0.35
|
25 |
+
anli_r1,3,median,accuracy,0.35
|
26 |
+
anli_r1,4,GPT-3 style,acc,0.338
|
27 |
+
anli_r1,4,MNLI crowdsource,acc,0.341
|
28 |
+
anli_r1,4,can we infer,acc,0.337
|
29 |
+
anli_r1,4,guaranteed/possible/impossible,acc,0.331
|
30 |
+
anli_r1,4,justified in saying,acc,0.328
|
31 |
+
anli_r1,4,median,accuracy,0.337
|
32 |
+
anli_r1,5,GPT-3 style,acc,0.348
|
33 |
+
anli_r1,5,MNLI crowdsource,acc,0.356
|
34 |
+
anli_r1,5,can we infer,acc,0.337
|
35 |
+
anli_r1,5,guaranteed/possible/impossible,acc,0.333
|
36 |
+
anli_r1,5,justified in saying,acc,0.327
|
37 |
+
anli_r1,5,median,accuracy,0.337
|
38 |
+
anli_r1,5,average,multiple,0.3396666666666667
|
39 |
+
anli_r2,0,GPT-3 style,acc,0.333
|
40 |
+
anli_r2,0,MNLI crowdsource,acc,0.325
|
41 |
+
anli_r2,0,can we infer,acc,0.332
|
42 |
+
anli_r2,0,guaranteed/possible/impossible,acc,0.311
|
43 |
+
anli_r2,0,justified in saying,acc,0.333
|
44 |
+
anli_r2,0,median,accuracy,0.332
|
45 |
+
anli_r2,1,GPT-3 style,acc,0.314
|
46 |
+
anli_r2,1,MNLI crowdsource,acc,0.319
|
47 |
+
anli_r2,1,can we infer,acc,0.315
|
48 |
+
anli_r2,1,guaranteed/possible/impossible,acc,0.315
|
49 |
+
anli_r2,1,justified in saying,acc,0.32
|
50 |
+
anli_r2,1,median,accuracy,0.315
|
51 |
+
anli_r2,2,GPT-3 style,acc,0.334
|
52 |
+
anli_r2,2,MNLI crowdsource,acc,0.339
|
53 |
+
anli_r2,2,can we infer,acc,0.323
|
54 |
+
anli_r2,2,guaranteed/possible/impossible,acc,0.335
|
55 |
+
anli_r2,2,justified in saying,acc,0.322
|
56 |
+
anli_r2,2,median,accuracy,0.334
|
57 |
+
anli_r2,3,GPT-3 style,acc,0.325
|
58 |
+
anli_r2,3,MNLI crowdsource,acc,0.314
|
59 |
+
anli_r2,3,can we infer,acc,0.321
|
60 |
+
anli_r2,3,guaranteed/possible/impossible,acc,0.335
|
61 |
+
anli_r2,3,justified in saying,acc,0.322
|
62 |
+
anli_r2,3,median,accuracy,0.322
|
63 |
+
anli_r2,4,GPT-3 style,acc,0.311
|
64 |
+
anli_r2,4,MNLI crowdsource,acc,0.303
|
65 |
+
anli_r2,4,can we infer,acc,0.332
|
66 |
+
anli_r2,4,guaranteed/possible/impossible,acc,0.333
|
67 |
+
anli_r2,4,justified in saying,acc,0.331
|
68 |
+
anli_r2,4,median,accuracy,0.331
|
69 |
+
anli_r2,5,GPT-3 style,acc,0.313
|
70 |
+
anli_r2,5,MNLI crowdsource,acc,0.305
|
71 |
+
anli_r2,5,can we infer,acc,0.326
|
72 |
+
anli_r2,5,guaranteed/possible/impossible,acc,0.333
|
73 |
+
anli_r2,5,justified in saying,acc,0.319
|
74 |
+
anli_r2,5,median,accuracy,0.319
|
75 |
+
anli_r2,5,average,multiple,0.3255
|
76 |
+
anli_r3,0,GPT-3 style,acc,0.33416666666666667
|
77 |
+
anli_r3,0,MNLI crowdsource,acc,0.33416666666666667
|
78 |
+
anli_r3,0,can we infer,acc,0.32666666666666666
|
79 |
+
anli_r3,0,guaranteed/possible/impossible,acc,0.31583333333333335
|
80 |
+
anli_r3,0,justified in saying,acc,0.3358333333333333
|
81 |
+
anli_r3,0,median,accuracy,0.33416666666666667
|
82 |
+
anli_r3,1,GPT-3 style,acc,0.3275
|
83 |
+
anli_r3,1,MNLI crowdsource,acc,0.3333333333333333
|
84 |
+
anli_r3,1,can we infer,acc,0.33666666666666667
|
85 |
+
anli_r3,1,guaranteed/possible/impossible,acc,0.33666666666666667
|
86 |
+
anli_r3,1,justified in saying,acc,0.33416666666666667
|
87 |
+
anli_r3,1,median,accuracy,0.33416666666666667
|
88 |
+
anli_r3,2,GPT-3 style,acc,0.33
|
89 |
+
anli_r3,2,MNLI crowdsource,acc,0.31583333333333335
|
90 |
+
anli_r3,2,can we infer,acc,0.31166666666666665
|
91 |
+
anli_r3,2,guaranteed/possible/impossible,acc,0.33166666666666667
|
92 |
+
anli_r3,2,justified in saying,acc,0.32166666666666666
|
93 |
+
anli_r3,2,median,accuracy,0.32166666666666666
|
94 |
+
anli_r3,3,GPT-3 style,acc,0.335
|
95 |
+
anli_r3,3,MNLI crowdsource,acc,0.3333333333333333
|
96 |
+
anli_r3,3,can we infer,acc,0.3333333333333333
|
97 |
+
anli_r3,3,guaranteed/possible/impossible,acc,0.32916666666666666
|
98 |
+
anli_r3,3,justified in saying,acc,0.3383333333333333
|
99 |
+
anli_r3,3,median,accuracy,0.3333333333333333
|
100 |
+
anli_r3,4,GPT-3 style,acc,0.31666666666666665
|
101 |
+
anli_r3,4,MNLI crowdsource,acc,0.31666666666666665
|
102 |
+
anli_r3,4,can we infer,acc,0.31916666666666665
|
103 |
+
anli_r3,4,guaranteed/possible/impossible,acc,0.3425
|
104 |
+
anli_r3,4,justified in saying,acc,0.3275
|
105 |
+
anli_r3,4,median,accuracy,0.31916666666666665
|
106 |
+
anli_r3,5,GPT-3 style,acc,0.3308333333333333
|
107 |
+
anli_r3,5,MNLI crowdsource,acc,0.315
|
108 |
+
anli_r3,5,can we infer,acc,0.3225
|
109 |
+
anli_r3,5,guaranteed/possible/impossible,acc,0.33416666666666667
|
110 |
+
anli_r3,5,justified in saying,acc,0.31833333333333336
|
111 |
+
anli_r3,5,median,accuracy,0.3225
|
112 |
+
anli_r3,5,average,multiple,0.3275
|
113 |
+
arc_easy,0,heres_a_problem,acc,0.23274410774410775
|
114 |
+
arc_easy,0,i_am_hesitating,acc,0.26706484641638223
|
115 |
+
arc_easy,0,multiple_choice,acc,0.2958754208754209
|
116 |
+
arc_easy,0,pick_the_most_correct_option,acc,0.2295221843003413
|
117 |
+
arc_easy,0,qa_options,acc,0.35269360269360267
|
118 |
+
arc_easy,0,median,accuracy,0.26706484641638223
|
119 |
+
arc_easy,1,heres_a_problem,acc,0.23208191126279865
|
120 |
+
arc_easy,1,i_am_hesitating,acc,0.2790102389078498
|
121 |
+
arc_easy,1,multiple_choice,acc,0.30303030303030304
|
122 |
+
arc_easy,1,pick_the_most_correct_option,acc,0.22440273037542663
|
123 |
+
arc_easy,1,qa_options,acc,0.26621160409556316
|
124 |
+
arc_easy,1,median,accuracy,0.26621160409556316
|
125 |
+
arc_easy,2,heres_a_problem,acc,0.22013651877133106
|
126 |
+
arc_easy,2,i_am_hesitating,acc,0.33207070707070707
|
127 |
+
arc_easy,2,multiple_choice,acc,0.2431740614334471
|
128 |
+
arc_easy,2,pick_the_most_correct_option,acc,0.21928327645051193
|
129 |
+
arc_easy,2,qa_options,acc,0.3409090909090909
|
130 |
+
arc_easy,2,median,accuracy,0.2431740614334471
|
131 |
+
arc_easy,3,heres_a_problem,acc,0.24368686868686867
|
132 |
+
arc_easy,3,i_am_hesitating,acc,0.2508532423208191
|
133 |
+
arc_easy,3,multiple_choice,acc,0.3202861952861953
|
134 |
+
arc_easy,3,pick_the_most_correct_option,acc,0.24494949494949494
|
135 |
+
arc_easy,3,qa_options,acc,0.26023890784982934
|
136 |
+
arc_easy,3,median,accuracy,0.2508532423208191
|
137 |
+
arc_easy,4,heres_a_problem,acc,0.23863636363636365
|
138 |
+
arc_easy,4,i_am_hesitating,acc,0.3400673400673401
|
139 |
+
arc_easy,4,multiple_choice,acc,0.30134680134680136
|
140 |
+
arc_easy,4,pick_the_most_correct_option,acc,0.2354948805460751
|
141 |
+
arc_easy,4,qa_options,acc,0.3287037037037037
|
142 |
+
arc_easy,4,median,accuracy,0.30134680134680136
|
143 |
+
arc_easy,5,heres_a_problem,acc,0.23208191126279865
|
144 |
+
arc_easy,5,i_am_hesitating,acc,0.335016835016835
|
145 |
+
arc_easy,5,multiple_choice,acc,0.24146757679180889
|
146 |
+
arc_easy,5,pick_the_most_correct_option,acc,0.23378839590443687
|
147 |
+
arc_easy,5,qa_options,acc,0.25170648464163825
|
148 |
+
arc_easy,5,median,accuracy,0.24146757679180889
|
149 |
+
arc_easy,5,average,multiple,0.26168635540080365
|
150 |
+
boolq,0,GPT-3 Style,acc,0.5496666666666666
|
151 |
+
boolq,0,after_reading,acc,0.6233333333333333
|
152 |
+
boolq,0,exercise,acc,0.6236666666666667
|
153 |
+
boolq,0,valid_binary,acc,0.611
|
154 |
+
boolq,0,yes_no_question,acc,0.606
|
155 |
+
boolq,0,median,accuracy,0.611
|
156 |
+
boolq,1,GPT-3 Style,acc,0.56
|
157 |
+
boolq,1,after_reading,acc,0.5856666666666667
|
158 |
+
boolq,1,exercise,acc,0.5576666666666666
|
159 |
+
boolq,1,valid_binary,acc,0.6203333333333333
|
160 |
+
boolq,1,yes_no_question,acc,0.5746666666666667
|
161 |
+
boolq,1,median,accuracy,0.5746666666666667
|
162 |
+
boolq,2,GPT-3 Style,acc,0.58
|
163 |
+
boolq,2,after_reading,acc,0.6053333333333333
|
164 |
+
boolq,2,exercise,acc,0.5663333333333334
|
165 |
+
boolq,2,valid_binary,acc,0.623
|
166 |
+
boolq,2,yes_no_question,acc,0.5926666666666667
|
167 |
+
boolq,2,median,accuracy,0.5926666666666667
|
168 |
+
boolq,3,GPT-3 Style,acc,0.5823333333333334
|
169 |
+
boolq,3,after_reading,acc,0.6026666666666667
|
170 |
+
boolq,3,exercise,acc,0.5706666666666667
|
171 |
+
boolq,3,valid_binary,acc,0.6233333333333333
|
172 |
+
boolq,3,yes_no_question,acc,0.595
|
173 |
+
boolq,3,median,accuracy,0.595
|
174 |
+
boolq,4,GPT-3 Style,acc,0.587
|
175 |
+
boolq,4,after_reading,acc,0.6043333333333333
|
176 |
+
boolq,4,exercise,acc,0.5726666666666667
|
177 |
+
boolq,4,valid_binary,acc,0.621
|
178 |
+
boolq,4,yes_no_question,acc,0.577
|
179 |
+
boolq,4,median,accuracy,0.587
|
180 |
+
boolq,5,GPT-3 Style,acc,0.5886666666666667
|
181 |
+
boolq,5,after_reading,acc,0.604
|
182 |
+
boolq,5,exercise,acc,0.5673333333333334
|
183 |
+
boolq,5,valid_binary,acc,0.6223333333333333
|
184 |
+
boolq,5,yes_no_question,acc,0.5723333333333334
|
185 |
+
boolq,5,median,accuracy,0.5886666666666667
|
186 |
+
boolq,5,average,multiple,0.5915
|
187 |
+
cb,0,GPT-3 style,acc,0.4107142857142857
|
188 |
+
cb,0,MNLI crowdsource,acc,0.42857142857142855
|
189 |
+
cb,0,can we infer,acc,0.375
|
190 |
+
cb,0,guaranteed/possible/impossible,acc,0.42857142857142855
|
191 |
+
cb,0,justified in saying,acc,0.42857142857142855
|
192 |
+
cb,0,median,accuracy,0.42857142857142855
|
193 |
+
cb,1,GPT-3 style,acc,0.3392857142857143
|
194 |
+
cb,1,MNLI crowdsource,acc,0.39285714285714285
|
195 |
+
cb,1,can we infer,acc,0.39285714285714285
|
196 |
+
cb,1,guaranteed/possible/impossible,acc,0.375
|
197 |
+
cb,1,justified in saying,acc,0.375
|
198 |
+
cb,1,median,accuracy,0.375
|
199 |
+
cb,2,GPT-3 style,acc,0.4107142857142857
|
200 |
+
cb,2,MNLI crowdsource,acc,0.375
|
201 |
+
cb,2,can we infer,acc,0.375
|
202 |
+
cb,2,guaranteed/possible/impossible,acc,0.16071428571428573
|
203 |
+
cb,2,justified in saying,acc,0.44642857142857145
|
204 |
+
cb,2,median,accuracy,0.375
|
205 |
+
cb,3,GPT-3 style,acc,0.4107142857142857
|
206 |
+
cb,3,MNLI crowdsource,acc,0.3392857142857143
|
207 |
+
cb,3,can we infer,acc,0.4642857142857143
|
208 |
+
cb,3,guaranteed/possible/impossible,acc,0.16071428571428573
|
209 |
+
cb,3,justified in saying,acc,0.4107142857142857
|
210 |
+
cb,3,median,accuracy,0.4107142857142857
|
211 |
+
cb,4,GPT-3 style,acc,0.35714285714285715
|
212 |
+
cb,4,MNLI crowdsource,acc,0.35714285714285715
|
213 |
+
cb,4,can we infer,acc,0.4642857142857143
|
214 |
+
cb,4,guaranteed/possible/impossible,acc,0.14285714285714285
|
215 |
+
cb,4,justified in saying,acc,0.48214285714285715
|
216 |
+
cb,4,median,accuracy,0.35714285714285715
|
217 |
+
cb,5,GPT-3 style,acc,0.35714285714285715
|
218 |
+
cb,5,MNLI crowdsource,acc,0.4107142857142857
|
219 |
+
cb,5,can we infer,acc,0.5
|
220 |
+
cb,5,guaranteed/possible/impossible,acc,0.16071428571428573
|
221 |
+
cb,5,justified in saying,acc,0.4107142857142857
|
222 |
+
cb,5,median,accuracy,0.4107142857142857
|
223 |
+
cb,5,average,multiple,0.39285714285714285
|
224 |
+
copa,0,best_option,acc,0.55
|
225 |
+
copa,0,cause_effect,acc,0.54
|
226 |
+
copa,0,choose,acc,0.57
|
227 |
+
copa,0,i_am_hesitating,acc,0.54
|
228 |
+
copa,0,plausible_alternatives,acc,0.57
|
229 |
+
copa,0,median,accuracy,0.55
|
230 |
+
copa,1,best_option,acc,0.48
|
231 |
+
copa,1,cause_effect,acc,0.44
|
232 |
+
copa,1,choose,acc,0.46
|
233 |
+
copa,1,i_am_hesitating,acc,0.46
|
234 |
+
copa,1,plausible_alternatives,acc,0.41
|
235 |
+
copa,1,median,accuracy,0.46
|
236 |
+
copa,2,best_option,acc,0.42
|
237 |
+
copa,2,cause_effect,acc,0.41
|
238 |
+
copa,2,choose,acc,0.4
|
239 |
+
copa,2,i_am_hesitating,acc,0.4
|
240 |
+
copa,2,plausible_alternatives,acc,0.39
|
241 |
+
copa,2,median,accuracy,0.4
|
242 |
+
copa,3,best_option,acc,0.46
|
243 |
+
copa,3,cause_effect,acc,0.42
|
244 |
+
copa,3,choose,acc,0.4
|
245 |
+
copa,3,i_am_hesitating,acc,0.42
|
246 |
+
copa,3,plausible_alternatives,acc,0.43
|
247 |
+
copa,3,median,accuracy,0.42
|
248 |
+
copa,4,best_option,acc,0.47
|
249 |
+
copa,4,cause_effect,acc,0.39
|
250 |
+
copa,4,choose,acc,0.46
|
251 |
+
copa,4,i_am_hesitating,acc,0.41
|
252 |
+
copa,4,plausible_alternatives,acc,0.43
|
253 |
+
copa,4,median,accuracy,0.43
|
254 |
+
copa,5,best_option,acc,0.46
|
255 |
+
copa,5,cause_effect,acc,0.43
|
256 |
+
copa,5,choose,acc,0.45
|
257 |
+
copa,5,i_am_hesitating,acc,0.41
|
258 |
+
copa,5,plausible_alternatives,acc,0.45
|
259 |
+
copa,5,median,accuracy,0.45
|
260 |
+
copa,5,average,multiple,0.45166666666666666
|
261 |
+
e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.09892905722529392
|
262 |
+
e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.02500994962430241
|
263 |
+
e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.0
|
264 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.004707141554710639
|
265 |
+
e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.1073793884770636
|
266 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.02500994962430241
|
267 |
+
e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1571930586851638
|
268 |
+
e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16340881202208163
|
269 |
+
e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.030860375813935463
|
270 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1486663277769484
|
271 |
+
e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.19855266031915028
|
272 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1571930586851638
|
273 |
+
e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.16543669174208636
|
274 |
+
e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.17467867245016275
|
275 |
+
e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.062337691922640125
|
276 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1507673483604289
|
277 |
+
e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.20344292743727435
|
278 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16543669174208636
|
279 |
+
e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.1703052547809578
|
280 |
+
e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.17648552604551038
|
281 |
+
e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.08500284986690841
|
282 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.158839720125521
|
283 |
+
e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.20463135769763866
|
284 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.1703052547809578
|
285 |
+
e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.1738253944796353
|
286 |
+
e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.17525069265082474
|
287 |
+
e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.1009343907225879
|
288 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.16013598883167798
|
289 |
+
e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19812463968549573
|
290 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1738253944796353
|
291 |
+
e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.17329021394802077
|
292 |
+
e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.17585670830781294
|
293 |
+
e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.11077316594795349
|
294 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1648812739511937
|
295 |
+
e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.1945014895681582
|
296 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17329021394802077
|
297 |
+
e2e_nlg_cleaned,5,average,multiple,0.14417676054336107
|
298 |
+
gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0190566029197429
|
299 |
+
gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.05349637631115593
|
300 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.050742967235947956
|
301 |
+
gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.040126835769534804
|
302 |
+
gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.05124089074244038
|
303 |
+
gem_xsum,0,median,rouge2_fmeasure,0.050742967235947956
|
304 |
+
gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.01908469081931983
|
305 |
+
gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.051542487477497304
|
306 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03956915695649403
|
307 |
+
gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04851112854401421
|
308 |
+
gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.03871722499957788
|
309 |
+
gem_xsum,1,median,rouge2_fmeasure,0.03956915695649403
|
310 |
+
gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.026322118720045605
|
311 |
+
gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05337831779753894
|
312 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.040730679478674064
|
313 |
+
gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.04736923476037229
|
314 |
+
gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.038463882894735665
|
315 |
+
gem_xsum,2,median,rouge2_fmeasure,0.040730679478674064
|
316 |
+
gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03227615271942288
|
317 |
+
gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.052933674983345634
|
318 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03767895922224648
|
319 |
+
gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.04353291741965738
|
320 |
+
gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.0361588906854937
|
321 |
+
gem_xsum,3,median,rouge2_fmeasure,0.03767895922224648
|
322 |
+
gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.00873416530365632
|
323 |
+
gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.01418426797251855
|
324 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01035393012550112
|
325 |
+
gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.011212198666180598
|
326 |
+
gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.009582916301059853
|
327 |
+
gem_xsum,4,median,rouge2_fmeasure,0.01035393012550112
|
328 |
+
gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
|
329 |
+
gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.000325473526945072
|
330 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003555930988203656
|
331 |
+
gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.0005008107704990395
|
332 |
+
gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.0
|
333 |
+
gem_xsum,5,median,rouge2_fmeasure,0.000325473526945072
|
334 |
+
gem_xsum,5,average,multiple,0.029900194424301453
|
335 |
+
piqa,0,Correct the solution,rouge2_fmeasure,0.15599026594193496
|
336 |
+
piqa,0,choose the most appropriate solution,acc,0.49455930359085964
|
337 |
+
piqa,0,no prompt needed,rouge2_fmeasure,0.005465981531797976
|
338 |
+
piqa,0,pick_correct_choice_index,acc,0.49510337323177367
|
339 |
+
piqa,0,what_is_the_correct_ending,acc,0.559847660500544
|
340 |
+
piqa,0,median,accuracy,0.49510337323177367
|
341 |
+
piqa,1,Correct the solution,rouge2_fmeasure,0.20124233569826713
|
342 |
+
piqa,1,choose the most appropriate solution,acc,0.5021762785636561
|
343 |
+
piqa,1,no prompt needed,rouge2_fmeasure,0.005423476681292554
|
344 |
+
piqa,1,pick_correct_choice_index,acc,0.49782372143634385
|
345 |
+
piqa,1,what_is_the_correct_ending,acc,0.5418933623503809
|
346 |
+
piqa,1,median,accuracy,0.5021762785636561
|
347 |
+
piqa,2,Correct the solution,rouge2_fmeasure,0.3238830207743833
|
348 |
+
piqa,2,choose the most appropriate solution,acc,0.514689880304679
|
349 |
+
piqa,2,no prompt needed,rouge2_fmeasure,0.004712505847751591
|
350 |
+
piqa,2,pick_correct_choice_index,acc,0.49347116430903154
|
351 |
+
piqa,2,what_is_the_correct_ending,acc,0.5321001088139282
|
352 |
+
piqa,2,median,accuracy,0.514689880304679
|
353 |
+
piqa,3,Correct the solution,rouge2_fmeasure,0.4057090348109076
|
354 |
+
piqa,3,choose the most appropriate solution,acc,0.5130576713819369
|
355 |
+
piqa,3,no prompt needed,rouge2_fmeasure,0.00470586424330017
|
356 |
+
piqa,3,pick_correct_choice_index,acc,0.4880304678998912
|
357 |
+
piqa,3,what_is_the_correct_ending,acc,0.5310119695321001
|
358 |
+
piqa,3,median,accuracy,0.5130576713819369
|
359 |
+
piqa,4,Correct the solution,rouge2_fmeasure,0.44072198270594876
|
360 |
+
piqa,4,choose the most appropriate solution,acc,0.5076169749727966
|
361 |
+
piqa,4,no prompt needed,rouge2_fmeasure,0.004310884060921524
|
362 |
+
piqa,4,pick_correct_choice_index,acc,0.5195865070729053
|
363 |
+
piqa,4,what_is_the_correct_ending,acc,0.5413492927094669
|
364 |
+
piqa,4,median,accuracy,0.5195865070729053
|
365 |
+
piqa,5,Correct the solution,rouge2_fmeasure,0.4560672630321141
|
366 |
+
piqa,5,choose the most appropriate solution,acc,0.5087051142546246
|
367 |
+
piqa,5,no prompt needed,rouge2_fmeasure,0.0046971364054093695
|
368 |
+
piqa,5,pick_correct_choice_index,acc,0.5076169749727966
|
369 |
+
piqa,5,what_is_the_correct_ending,acc,0.5386289445048966
|
370 |
+
piqa,5,median,accuracy,0.5087051142546246
|
371 |
+
piqa,5,average,multiple,0.5088864708015959
|
372 |
+
sciq,0,Direct Question,acc,0.876
|
373 |
+
sciq,0,Direct Question (Closed Book),acc,0.623
|
374 |
+
sciq,0,Multiple Choice,acc,0.6
|
375 |
+
sciq,0,Multiple Choice (Closed Book),acc,0.486
|
376 |
+
sciq,0,Multiple Choice Question First,acc,0.627
|
377 |
+
sciq,0,median,accuracy,0.623
|
378 |
+
sciq,1,Direct Question,acc,0.913
|
379 |
+
sciq,1,Direct Question (Closed Book),acc,0.698
|
380 |
+
sciq,1,Multiple Choice,acc,0.585
|
381 |
+
sciq,1,Multiple Choice (Closed Book),acc,0.517
|
382 |
+
sciq,1,Multiple Choice Question First,acc,0.51
|
383 |
+
sciq,1,median,accuracy,0.585
|
384 |
+
sciq,2,Direct Question,acc,0.914
|
385 |
+
sciq,2,Direct Question (Closed Book),acc,0.715
|
386 |
+
sciq,2,Multiple Choice,acc,0.608
|
387 |
+
sciq,2,Multiple Choice (Closed Book),acc,0.51
|
388 |
+
sciq,2,Multiple Choice Question First,acc,0.583
|
389 |
+
sciq,2,median,accuracy,0.608
|
390 |
+
sciq,3,Direct Question,acc,0.92
|
391 |
+
sciq,3,Direct Question (Closed Book),acc,0.71
|
392 |
+
sciq,3,Multiple Choice,acc,0.637
|
393 |
+
sciq,3,Multiple Choice (Closed Book),acc,0.529
|
394 |
+
sciq,3,Multiple Choice Question First,acc,0.595
|
395 |
+
sciq,3,median,accuracy,0.637
|
396 |
+
sciq,4,Direct Question,acc,0.922
|
397 |
+
sciq,4,Direct Question (Closed Book),acc,0.717
|
398 |
+
sciq,4,Multiple Choice,acc,0.62
|
399 |
+
sciq,4,Multiple Choice (Closed Book),acc,0.545
|
400 |
+
sciq,4,Multiple Choice Question First,acc,0.599
|
401 |
+
sciq,4,median,accuracy,0.62
|
402 |
+
sciq,5,Direct Question,acc,0.924
|
403 |
+
sciq,5,Direct Question (Closed Book),acc,0.727
|
404 |
+
sciq,5,Multiple Choice,acc,0.625
|
405 |
+
sciq,5,Multiple Choice (Closed Book),acc,0.547
|
406 |
+
sciq,5,Multiple Choice Question First,acc,0.585
|
407 |
+
sciq,5,median,accuracy,0.625
|
408 |
+
sciq,5,average,multiple,0.6163333333333333
|
409 |
+
story_cloze_2016,0,Answer Given options,acc,0.4778193479422769
|
410 |
+
story_cloze_2016,0,Choose Story Ending,acc,0.4890432923570283
|
411 |
+
story_cloze_2016,0,Novel Correct Ending,acc,0.4751469802244789
|
412 |
+
story_cloze_2016,0,Story Continuation and Options,acc,0.5114911811865313
|
413 |
+
story_cloze_2016,0,median,accuracy,0.4834313201496526
|
414 |
+
story_cloze_2016,1,Answer Given options,acc,0.4585783003741315
|
415 |
+
story_cloze_2016,1,Choose Story Ending,acc,0.46980224478888294
|
416 |
+
story_cloze_2016,1,Novel Correct Ending,acc,0.47995724211651525
|
417 |
+
story_cloze_2016,1,Story Continuation and Options,acc,0.48957776590058794
|
418 |
+
story_cloze_2016,1,median,accuracy,0.4748797434526991
|
419 |
+
story_cloze_2016,2,Answer Given options,acc,0.46980224478888294
|
420 |
+
story_cloze_2016,2,Choose Story Ending,acc,0.4660609299839658
|
421 |
+
story_cloze_2016,2,Novel Correct Ending,acc,0.4730090860502405
|
422 |
+
story_cloze_2016,2,Story Continuation and Options,acc,0.4949225013361839
|
423 |
+
story_cloze_2016,2,median,accuracy,0.47140566541956175
|
424 |
+
story_cloze_2016,3,Answer Given options,acc,0.46125066809192944
|
425 |
+
story_cloze_2016,3,Choose Story Ending,acc,0.47247461250668094
|
426 |
+
story_cloze_2016,3,Novel Correct Ending,acc,0.4655264564404062
|
427 |
+
story_cloze_2016,3,Story Continuation and Options,acc,0.4938535542490647
|
428 |
+
story_cloze_2016,3,median,accuracy,0.46900053447354356
|
429 |
+
story_cloze_2016,4,Answer Given options,acc,0.4436130411544629
|
430 |
+
story_cloze_2016,4,Choose Story Ending,acc,0.46392303580972744
|
431 |
+
story_cloze_2016,4,Novel Correct Ending,acc,0.46125066809192944
|
432 |
+
story_cloze_2016,4,Story Continuation and Options,acc,0.5077498663816141
|
433 |
+
story_cloze_2016,4,median,accuracy,0.46258685195082844
|
434 |
+
story_cloze_2016,5,Answer Given options,acc,0.4462854088722608
|
435 |
+
story_cloze_2016,5,Choose Story Ending,acc,0.4708711918760021
|
436 |
+
story_cloze_2016,5,Novel Correct Ending,acc,0.4462854088722608
|
437 |
+
story_cloze_2016,5,Story Continuation and Options,acc,0.4938535542490647
|
438 |
+
story_cloze_2016,5,median,accuracy,0.4585783003741315
|
439 |
+
story_cloze_2016,5,average,multiple,0.4699804026367362
|
440 |
+
superglue_rte,0,GPT-3 style,acc,0.5234657039711191
|
441 |
+
superglue_rte,0,MNLI crowdsource,acc,0.48014440433212996
|
442 |
+
superglue_rte,0,does it follow that,acc,0.48014440433212996
|
443 |
+
superglue_rte,0,guaranteed true,acc,0.49458483754512633
|
444 |
+
superglue_rte,0,should assume,acc,0.4981949458483754
|
445 |
+
superglue_rte,0,median,accuracy,0.49458483754512633
|
446 |
+
superglue_rte,1,GPT-3 style,acc,0.516245487364621
|
447 |
+
superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
|
448 |
+
superglue_rte,1,does it follow that,acc,0.49097472924187724
|
449 |
+
superglue_rte,1,guaranteed true,acc,0.49458483754512633
|
450 |
+
superglue_rte,1,should assume,acc,0.49097472924187724
|
451 |
+
superglue_rte,1,median,accuracy,0.49097472924187724
|
452 |
+
superglue_rte,2,GPT-3 style,acc,0.5270758122743683
|
453 |
+
superglue_rte,2,MNLI crowdsource,acc,0.5054151624548736
|
454 |
+
superglue_rte,2,does it follow that,acc,0.5126353790613718
|
455 |
+
superglue_rte,2,guaranteed true,acc,0.5090252707581228
|
456 |
+
superglue_rte,2,should assume,acc,0.5054151624548736
|
457 |
+
superglue_rte,2,median,accuracy,0.5090252707581228
|
458 |
+
superglue_rte,3,GPT-3 style,acc,0.5342960288808665
|
459 |
+
superglue_rte,3,MNLI crowdsource,acc,0.516245487364621
|
460 |
+
superglue_rte,3,does it follow that,acc,0.51985559566787
|
461 |
+
superglue_rte,3,guaranteed true,acc,0.5054151624548736
|
462 |
+
superglue_rte,3,should assume,acc,0.5306859205776173
|
463 |
+
superglue_rte,3,median,accuracy,0.51985559566787
|
464 |
+
superglue_rte,4,GPT-3 style,acc,0.555956678700361
|
465 |
+
superglue_rte,4,MNLI crowdsource,acc,0.5342960288808665
|
466 |
+
superglue_rte,4,does it follow that,acc,0.51985559566787
|
467 |
+
superglue_rte,4,guaranteed true,acc,0.5379061371841155
|
468 |
+
superglue_rte,4,should assume,acc,0.5523465703971119
|
469 |
+
superglue_rte,4,median,accuracy,0.5379061371841155
|
470 |
+
superglue_rte,5,GPT-3 style,acc,0.5667870036101083
|
471 |
+
superglue_rte,5,MNLI crowdsource,acc,0.5415162454873647
|
472 |
+
superglue_rte,5,does it follow that,acc,0.5379061371841155
|
473 |
+
superglue_rte,5,guaranteed true,acc,0.5270758122743683
|
474 |
+
superglue_rte,5,should assume,acc,0.51985559566787
|
475 |
+
superglue_rte,5,median,accuracy,0.5379061371841155
|
476 |
+
superglue_rte,5,average,multiple,0.5150421179302046
|
477 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05308201459552208
|
478 |
+
web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.0031079038623638374
|
479 |
+
web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.004093823034187833
|
480 |
+
web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.0091292497595853
|
481 |
+
web_nlg_en,0,very-explicit-description,rouge2_fmeasure,0.004145988607839967
|
482 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.004145988607839967
|
483 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.054620931903283015
|
484 |
+
web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.10582834218904164
|
485 |
+
web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.048933388223398
|
486 |
+
web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.09345632716516253
|
487 |
+
web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.03326346351515616
|
488 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.054620931903283015
|
489 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.04972618028817665
|
490 |
+
web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.18715015079380715
|
491 |
+
web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.0664535256463577
|
492 |
+
web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.09362913479276391
|
493 |
+
web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.05216469490783778
|
494 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.0664535256463577
|
495 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.038282775688304856
|
496 |
+
web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.18921046750479484
|
497 |
+
web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.07150806038871549
|
498 |
+
web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.08163089196343765
|
499 |
+
web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.06259164725081237
|
500 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.07150806038871549
|
501 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.035274756528572794
|
502 |
+
web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.1826637134726461
|
503 |
+
web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.0756101747685517
|
504 |
+
web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.0843786630967621
|
505 |
+
web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.07984816777629551
|
506 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.07984816777629551
|
507 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.04337652772461485
|
508 |
+
web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.1794175550575056
|
509 |
+
web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.07734137021121697
|
510 |
+
web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.10182499744322755
|
511 |
+
web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.08392662509619025
|
512 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.08392662509619025
|
513 |
+
web_nlg_en,5,average,multiple,0.06008388323644699
|
514 |
+
wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.046175339585206684
|
515 |
+
wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.011691902088528949
|
516 |
+
wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.013469176886654546
|
517 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.033925070200158246
|
518 |
+
wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.0012597496537706083
|
519 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.013469176886654546
|
520 |
+
wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.040168134051973815
|
521 |
+
wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.022783938421878452
|
522 |
+
wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.029442252351593513
|
523 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.053937307211284244
|
524 |
+
wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.02069013916427929
|
525 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.029442252351593513
|
526 |
+
wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.04567156337896173
|
527 |
+
wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.040384981669924526
|
528 |
+
wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.042503871652830684
|
529 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05625290668830642
|
530 |
+
wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.020465708060419606
|
531 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.042503871652830684
|
532 |
+
wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.03899411935423918
|
533 |
+
wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03839113878587267
|
534 |
+
wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.03902103124926395
|
535 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04688317854067561
|
536 |
+
wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.01711652805678942
|
537 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.03899411935423918
|
538 |
+
wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.012617379158558721
|
539 |
+
wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.012281079448761325
|
540 |
+
wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.01084532274916111
|
541 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013810868807903593
|
542 |
+
wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.0045440484711843825
|
543 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.012281079448761325
|
544 |
+
wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.0019124752375349172
|
545 |
+
wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0017130160893721097
|
546 |
+
wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0013869187743711499
|
547 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024939038248004536
|
548 |
+
wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.00036639163657650276
|
549 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0017130160893721097
|
550 |
+
wiki_lingua_en,5,average,multiple,0.023067252630575227
|
551 |
+
winogrande,0,Replace,acc,0.5019731649565904
|
552 |
+
winogrande,0,True or False,acc,0.4956590370955012
|
553 |
+
winogrande,0,does underscore refer to,acc,0.4996053670086819
|
554 |
+
winogrande,0,stand for,acc,0.510655090765588
|
555 |
+
winogrande,0,underscore refer to,acc,0.5138121546961326
|
556 |
+
winogrande,0,median,accuracy,0.5019731649565904
|
557 |
+
winogrande,1,Replace,acc,0.5074980268350434
|
558 |
+
winogrande,1,True or False,acc,0.48855564325177586
|
559 |
+
winogrande,1,does underscore refer to,acc,0.4956590370955012
|
560 |
+
winogrande,1,stand for,acc,0.5035516969218626
|
561 |
+
winogrande,1,underscore refer to,acc,0.4972375690607735
|
562 |
+
winogrande,1,median,accuracy,0.4972375690607735
|
563 |
+
winogrande,2,Replace,acc,0.5090765588003157
|
564 |
+
winogrande,2,True or False,acc,0.4940805051302289
|
565 |
+
winogrande,2,does underscore refer to,acc,0.5011838989739542
|
566 |
+
winogrande,2,stand for,acc,0.4980268350434096
|
567 |
+
winogrande,2,underscore refer to,acc,0.5082872928176796
|
568 |
+
winogrande,2,median,accuracy,0.5011838989739542
|
569 |
+
winogrande,3,Replace,acc,0.5217048145224941
|
570 |
+
winogrande,3,True or False,acc,0.4996053670086819
|
571 |
+
winogrande,3,does underscore refer to,acc,0.5153906866614049
|
572 |
+
winogrande,3,stand for,acc,0.5035516969218626
|
573 |
+
winogrande,3,underscore refer to,acc,0.505130228887135
|
574 |
+
winogrande,3,median,accuracy,0.505130228887135
|
575 |
+
winogrande,4,Replace,acc,0.5224940805051302
|
576 |
+
winogrande,4,True or False,acc,0.5027624309392266
|
577 |
+
winogrande,4,does underscore refer to,acc,0.5098658247829518
|
578 |
+
winogrande,4,stand for,acc,0.5082872928176796
|
579 |
+
winogrande,4,underscore refer to,acc,0.5043409629044988
|
580 |
+
winogrande,4,median,accuracy,0.5082872928176796
|
581 |
+
winogrande,5,Replace,acc,0.5122336227308603
|
582 |
+
winogrande,5,True or False,acc,0.5035516969218626
|
583 |
+
winogrande,5,does underscore refer to,acc,0.5074980268350434
|
584 |
+
winogrande,5,stand for,acc,0.48382004735595896
|
585 |
+
winogrande,5,underscore refer to,acc,0.5098658247829518
|
586 |
+
winogrande,5,median,accuracy,0.5074980268350434
|
587 |
+
winogrande,5,average,multiple,0.5035516969218626
|
4b284b17bc4/eval/merged.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b284b21bc4/eval/merged.csv
ADDED
@@ -0,0 +1,587 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
anli_r1,0,GPT-3 style,acc,0.331
|
3 |
+
anli_r1,0,MNLI crowdsource,acc,0.333
|
4 |
+
anli_r1,0,can we infer,acc,0.358
|
5 |
+
anli_r1,0,guaranteed/possible/impossible,acc,0.327
|
6 |
+
anli_r1,0,justified in saying,acc,0.356
|
7 |
+
anli_r1,0,median,accuracy,0.333
|
8 |
+
anli_r1,1,GPT-3 style,acc,0.327
|
9 |
+
anli_r1,1,MNLI crowdsource,acc,0.333
|
10 |
+
anli_r1,1,can we infer,acc,0.333
|
11 |
+
anli_r1,1,guaranteed/possible/impossible,acc,0.332
|
12 |
+
anli_r1,1,justified in saying,acc,0.333
|
13 |
+
anli_r1,1,median,accuracy,0.333
|
14 |
+
anli_r1,2,GPT-3 style,acc,0.335
|
15 |
+
anli_r1,2,MNLI crowdsource,acc,0.358
|
16 |
+
anli_r1,2,can we infer,acc,0.361
|
17 |
+
anli_r1,2,guaranteed/possible/impossible,acc,0.329
|
18 |
+
anli_r1,2,justified in saying,acc,0.358
|
19 |
+
anli_r1,2,median,accuracy,0.358
|
20 |
+
anli_r1,3,GPT-3 style,acc,0.347
|
21 |
+
anli_r1,3,MNLI crowdsource,acc,0.358
|
22 |
+
anli_r1,3,can we infer,acc,0.35
|
23 |
+
anli_r1,3,guaranteed/possible/impossible,acc,0.328
|
24 |
+
anli_r1,3,justified in saying,acc,0.355
|
25 |
+
anli_r1,3,median,accuracy,0.35
|
26 |
+
anli_r1,4,GPT-3 style,acc,0.329
|
27 |
+
anli_r1,4,MNLI crowdsource,acc,0.354
|
28 |
+
anli_r1,4,can we infer,acc,0.344
|
29 |
+
anli_r1,4,guaranteed/possible/impossible,acc,0.328
|
30 |
+
anli_r1,4,justified in saying,acc,0.336
|
31 |
+
anli_r1,4,median,accuracy,0.336
|
32 |
+
anli_r1,5,GPT-3 style,acc,0.339
|
33 |
+
anli_r1,5,MNLI crowdsource,acc,0.345
|
34 |
+
anli_r1,5,can we infer,acc,0.329
|
35 |
+
anli_r1,5,guaranteed/possible/impossible,acc,0.331
|
36 |
+
anli_r1,5,justified in saying,acc,0.337
|
37 |
+
anli_r1,5,median,accuracy,0.337
|
38 |
+
anli_r1,5,average,multiple,0.3411666666666667
|
39 |
+
anli_r2,0,GPT-3 style,acc,0.334
|
40 |
+
anli_r2,0,MNLI crowdsource,acc,0.333
|
41 |
+
anli_r2,0,can we infer,acc,0.35
|
42 |
+
anli_r2,0,guaranteed/possible/impossible,acc,0.34
|
43 |
+
anli_r2,0,justified in saying,acc,0.339
|
44 |
+
anli_r2,0,median,accuracy,0.339
|
45 |
+
anli_r2,1,GPT-3 style,acc,0.313
|
46 |
+
anli_r2,1,MNLI crowdsource,acc,0.315
|
47 |
+
anli_r2,1,can we infer,acc,0.315
|
48 |
+
anli_r2,1,guaranteed/possible/impossible,acc,0.314
|
49 |
+
anli_r2,1,justified in saying,acc,0.315
|
50 |
+
anli_r2,1,median,accuracy,0.315
|
51 |
+
anli_r2,2,GPT-3 style,acc,0.333
|
52 |
+
anli_r2,2,MNLI crowdsource,acc,0.329
|
53 |
+
anli_r2,2,can we infer,acc,0.323
|
54 |
+
anli_r2,2,guaranteed/possible/impossible,acc,0.31
|
55 |
+
anli_r2,2,justified in saying,acc,0.323
|
56 |
+
anli_r2,2,median,accuracy,0.323
|
57 |
+
anli_r2,3,GPT-3 style,acc,0.337
|
58 |
+
anli_r2,3,MNLI crowdsource,acc,0.317
|
59 |
+
anli_r2,3,can we infer,acc,0.338
|
60 |
+
anli_r2,3,guaranteed/possible/impossible,acc,0.32
|
61 |
+
anli_r2,3,justified in saying,acc,0.329
|
62 |
+
anli_r2,3,median,accuracy,0.329
|
63 |
+
anli_r2,4,GPT-3 style,acc,0.336
|
64 |
+
anli_r2,4,MNLI crowdsource,acc,0.314
|
65 |
+
anli_r2,4,can we infer,acc,0.334
|
66 |
+
anli_r2,4,guaranteed/possible/impossible,acc,0.326
|
67 |
+
anli_r2,4,justified in saying,acc,0.329
|
68 |
+
anli_r2,4,median,accuracy,0.329
|
69 |
+
anli_r2,5,GPT-3 style,acc,0.342
|
70 |
+
anli_r2,5,MNLI crowdsource,acc,0.304
|
71 |
+
anli_r2,5,can we infer,acc,0.324
|
72 |
+
anli_r2,5,guaranteed/possible/impossible,acc,0.332
|
73 |
+
anli_r2,5,justified in saying,acc,0.317
|
74 |
+
anli_r2,5,median,accuracy,0.324
|
75 |
+
anli_r2,5,average,multiple,0.3265
|
76 |
+
anli_r3,0,GPT-3 style,acc,0.335
|
77 |
+
anli_r3,0,MNLI crowdsource,acc,0.3358333333333333
|
78 |
+
anli_r3,0,can we infer,acc,0.3333333333333333
|
79 |
+
anli_r3,0,guaranteed/possible/impossible,acc,0.32083333333333336
|
80 |
+
anli_r3,0,justified in saying,acc,0.3416666666666667
|
81 |
+
anli_r3,0,median,accuracy,0.335
|
82 |
+
anli_r3,1,GPT-3 style,acc,0.335
|
83 |
+
anli_r3,1,MNLI crowdsource,acc,0.33666666666666667
|
84 |
+
anli_r3,1,can we infer,acc,0.33666666666666667
|
85 |
+
anli_r3,1,guaranteed/possible/impossible,acc,0.3358333333333333
|
86 |
+
anli_r3,1,justified in saying,acc,0.33666666666666667
|
87 |
+
anli_r3,1,median,accuracy,0.33666666666666667
|
88 |
+
anli_r3,2,GPT-3 style,acc,0.33
|
89 |
+
anli_r3,2,MNLI crowdsource,acc,0.3233333333333333
|
90 |
+
anli_r3,2,can we infer,acc,0.325
|
91 |
+
anli_r3,2,guaranteed/possible/impossible,acc,0.32
|
92 |
+
anli_r3,2,justified in saying,acc,0.325
|
93 |
+
anli_r3,2,median,accuracy,0.325
|
94 |
+
anli_r3,3,GPT-3 style,acc,0.33166666666666667
|
95 |
+
anli_r3,3,MNLI crowdsource,acc,0.31833333333333336
|
96 |
+
anli_r3,3,can we infer,acc,0.3325
|
97 |
+
anli_r3,3,guaranteed/possible/impossible,acc,0.3308333333333333
|
98 |
+
anli_r3,3,justified in saying,acc,0.3408333333333333
|
99 |
+
anli_r3,3,median,accuracy,0.33166666666666667
|
100 |
+
anli_r3,4,GPT-3 style,acc,0.32666666666666666
|
101 |
+
anli_r3,4,MNLI crowdsource,acc,0.31583333333333335
|
102 |
+
anli_r3,4,can we infer,acc,0.31583333333333335
|
103 |
+
anli_r3,4,guaranteed/possible/impossible,acc,0.33666666666666667
|
104 |
+
anli_r3,4,justified in saying,acc,0.3175
|
105 |
+
anli_r3,4,median,accuracy,0.3175
|
106 |
+
anli_r3,5,GPT-3 style,acc,0.31916666666666665
|
107 |
+
anli_r3,5,MNLI crowdsource,acc,0.31
|
108 |
+
anli_r3,5,can we infer,acc,0.31166666666666665
|
109 |
+
anli_r3,5,guaranteed/possible/impossible,acc,0.3383333333333333
|
110 |
+
anli_r3,5,justified in saying,acc,0.30833333333333335
|
111 |
+
anli_r3,5,median,accuracy,0.31166666666666665
|
112 |
+
anli_r3,5,average,multiple,0.32625
|
113 |
+
arc_easy,0,heres_a_problem,acc,0.25
|
114 |
+
arc_easy,0,i_am_hesitating,acc,0.35395622895622897
|
115 |
+
arc_easy,0,multiple_choice,acc,0.23378839590443687
|
116 |
+
arc_easy,0,pick_the_most_correct_option,acc,0.24705387205387205
|
117 |
+
arc_easy,0,qa_options,acc,0.26023890784982934
|
118 |
+
arc_easy,0,median,accuracy,0.25
|
119 |
+
arc_easy,1,heres_a_problem,acc,0.24368686868686867
|
120 |
+
arc_easy,1,i_am_hesitating,acc,0.3468013468013468
|
121 |
+
arc_easy,1,multiple_choice,acc,0.3253367003367003
|
122 |
+
arc_easy,1,pick_the_most_correct_option,acc,0.2295221843003413
|
123 |
+
arc_easy,1,qa_options,acc,0.3425925925925926
|
124 |
+
arc_easy,1,median,accuracy,0.3253367003367003
|
125 |
+
arc_easy,2,heres_a_problem,acc,0.2508532423208191
|
126 |
+
arc_easy,2,i_am_hesitating,acc,0.3383838383838384
|
127 |
+
arc_easy,2,multiple_choice,acc,0.351010101010101
|
128 |
+
arc_easy,2,pick_the_most_correct_option,acc,0.24829351535836178
|
129 |
+
arc_easy,2,qa_options,acc,0.335016835016835
|
130 |
+
arc_easy,2,median,accuracy,0.335016835016835
|
131 |
+
arc_easy,3,heres_a_problem,acc,0.24915824915824916
|
132 |
+
arc_easy,3,i_am_hesitating,acc,0.25170648464163825
|
133 |
+
arc_easy,3,multiple_choice,acc,0.2380546075085324
|
134 |
+
arc_easy,3,pick_the_most_correct_option,acc,0.25170648464163825
|
135 |
+
arc_easy,3,qa_options,acc,0.3400673400673401
|
136 |
+
arc_easy,3,median,accuracy,0.25170648464163825
|
137 |
+
arc_easy,4,heres_a_problem,acc,0.24284511784511784
|
138 |
+
arc_easy,4,i_am_hesitating,acc,0.3480639730639731
|
139 |
+
arc_easy,4,multiple_choice,acc,0.24146757679180889
|
140 |
+
arc_easy,4,pick_the_most_correct_option,acc,0.24284511784511784
|
141 |
+
arc_easy,4,qa_options,acc,0.3367003367003367
|
142 |
+
arc_easy,4,median,accuracy,0.24284511784511784
|
143 |
+
arc_easy,5,heres_a_problem,acc,0.2431740614334471
|
144 |
+
arc_easy,5,i_am_hesitating,acc,0.33880471380471383
|
145 |
+
arc_easy,5,multiple_choice,acc,0.33796296296296297
|
146 |
+
arc_easy,5,pick_the_most_correct_option,acc,0.25
|
147 |
+
arc_easy,5,qa_options,acc,0.25170648464163825
|
148 |
+
arc_easy,5,median,accuracy,0.25170648464163825
|
149 |
+
arc_easy,5,average,multiple,0.2761019370803216
|
150 |
+
boolq,0,GPT-3 Style,acc,0.5143333333333333
|
151 |
+
boolq,0,after_reading,acc,0.6233333333333333
|
152 |
+
boolq,0,exercise,acc,0.6236666666666667
|
153 |
+
boolq,0,valid_binary,acc,0.5753333333333334
|
154 |
+
boolq,0,yes_no_question,acc,0.5276666666666666
|
155 |
+
boolq,0,median,accuracy,0.5753333333333334
|
156 |
+
boolq,1,GPT-3 Style,acc,0.493
|
157 |
+
boolq,1,after_reading,acc,0.546
|
158 |
+
boolq,1,exercise,acc,0.6096666666666667
|
159 |
+
boolq,1,valid_binary,acc,0.5676666666666667
|
160 |
+
boolq,1,yes_no_question,acc,0.5406666666666666
|
161 |
+
boolq,1,median,accuracy,0.546
|
162 |
+
boolq,2,GPT-3 Style,acc,0.5063333333333333
|
163 |
+
boolq,2,after_reading,acc,0.5836666666666667
|
164 |
+
boolq,2,exercise,acc,0.6033333333333334
|
165 |
+
boolq,2,valid_binary,acc,0.593
|
166 |
+
boolq,2,yes_no_question,acc,0.5303333333333333
|
167 |
+
boolq,2,median,accuracy,0.5836666666666667
|
168 |
+
boolq,3,GPT-3 Style,acc,0.528
|
169 |
+
boolq,3,after_reading,acc,0.6116666666666667
|
170 |
+
boolq,3,exercise,acc,0.6083333333333333
|
171 |
+
boolq,3,valid_binary,acc,0.6066666666666667
|
172 |
+
boolq,3,yes_no_question,acc,0.5283333333333333
|
173 |
+
boolq,3,median,accuracy,0.6066666666666667
|
174 |
+
boolq,4,GPT-3 Style,acc,0.531
|
175 |
+
boolq,4,after_reading,acc,0.6136666666666667
|
176 |
+
boolq,4,exercise,acc,0.6133333333333333
|
177 |
+
boolq,4,valid_binary,acc,0.614
|
178 |
+
boolq,4,yes_no_question,acc,0.5186666666666667
|
179 |
+
boolq,4,median,accuracy,0.6133333333333333
|
180 |
+
boolq,5,GPT-3 Style,acc,0.5486666666666666
|
181 |
+
boolq,5,after_reading,acc,0.6126666666666667
|
182 |
+
boolq,5,exercise,acc,0.6183333333333333
|
183 |
+
boolq,5,valid_binary,acc,0.6123333333333333
|
184 |
+
boolq,5,yes_no_question,acc,0.5196666666666667
|
185 |
+
boolq,5,median,accuracy,0.6123333333333333
|
186 |
+
boolq,5,average,multiple,0.5895555555555556
|
187 |
+
cb,0,GPT-3 style,acc,0.375
|
188 |
+
cb,0,MNLI crowdsource,acc,0.4107142857142857
|
189 |
+
cb,0,can we infer,acc,0.5357142857142857
|
190 |
+
cb,0,guaranteed/possible/impossible,acc,0.10714285714285714
|
191 |
+
cb,0,justified in saying,acc,0.5178571428571429
|
192 |
+
cb,0,median,accuracy,0.4107142857142857
|
193 |
+
cb,1,GPT-3 style,acc,0.375
|
194 |
+
cb,1,MNLI crowdsource,acc,0.39285714285714285
|
195 |
+
cb,1,can we infer,acc,0.39285714285714285
|
196 |
+
cb,1,guaranteed/possible/impossible,acc,0.375
|
197 |
+
cb,1,justified in saying,acc,0.39285714285714285
|
198 |
+
cb,1,median,accuracy,0.39285714285714285
|
199 |
+
cb,2,GPT-3 style,acc,0.35714285714285715
|
200 |
+
cb,2,MNLI crowdsource,acc,0.4642857142857143
|
201 |
+
cb,2,can we infer,acc,0.39285714285714285
|
202 |
+
cb,2,guaranteed/possible/impossible,acc,0.25
|
203 |
+
cb,2,justified in saying,acc,0.39285714285714285
|
204 |
+
cb,2,median,accuracy,0.39285714285714285
|
205 |
+
cb,3,GPT-3 style,acc,0.3392857142857143
|
206 |
+
cb,3,MNLI crowdsource,acc,0.4107142857142857
|
207 |
+
cb,3,can we infer,acc,0.39285714285714285
|
208 |
+
cb,3,guaranteed/possible/impossible,acc,0.14285714285714285
|
209 |
+
cb,3,justified in saying,acc,0.375
|
210 |
+
cb,3,median,accuracy,0.375
|
211 |
+
cb,4,GPT-3 style,acc,0.32142857142857145
|
212 |
+
cb,4,MNLI crowdsource,acc,0.42857142857142855
|
213 |
+
cb,4,can we infer,acc,0.44642857142857145
|
214 |
+
cb,4,guaranteed/possible/impossible,acc,0.10714285714285714
|
215 |
+
cb,4,justified in saying,acc,0.44642857142857145
|
216 |
+
cb,4,median,accuracy,0.42857142857142855
|
217 |
+
cb,5,GPT-3 style,acc,0.2857142857142857
|
218 |
+
cb,5,MNLI crowdsource,acc,0.4107142857142857
|
219 |
+
cb,5,can we infer,acc,0.44642857142857145
|
220 |
+
cb,5,guaranteed/possible/impossible,acc,0.14285714285714285
|
221 |
+
cb,5,justified in saying,acc,0.44642857142857145
|
222 |
+
cb,5,median,accuracy,0.4107142857142857
|
223 |
+
cb,5,average,multiple,0.4017857142857143
|
224 |
+
copa,0,best_option,acc,0.6
|
225 |
+
copa,0,cause_effect,acc,0.54
|
226 |
+
copa,0,choose,acc,0.58
|
227 |
+
copa,0,i_am_hesitating,acc,0.54
|
228 |
+
copa,0,plausible_alternatives,acc,0.54
|
229 |
+
copa,0,median,accuracy,0.54
|
230 |
+
copa,1,best_option,acc,0.53
|
231 |
+
copa,1,cause_effect,acc,0.42
|
232 |
+
copa,1,choose,acc,0.44
|
233 |
+
copa,1,i_am_hesitating,acc,0.43
|
234 |
+
copa,1,plausible_alternatives,acc,0.45
|
235 |
+
copa,1,median,accuracy,0.44
|
236 |
+
copa,2,best_option,acc,0.63
|
237 |
+
copa,2,cause_effect,acc,0.44
|
238 |
+
copa,2,choose,acc,0.4
|
239 |
+
copa,2,i_am_hesitating,acc,0.41
|
240 |
+
copa,2,plausible_alternatives,acc,0.42
|
241 |
+
copa,2,median,accuracy,0.42
|
242 |
+
copa,3,best_option,acc,0.6
|
243 |
+
copa,3,cause_effect,acc,0.44
|
244 |
+
copa,3,choose,acc,0.39
|
245 |
+
copa,3,i_am_hesitating,acc,0.44
|
246 |
+
copa,3,plausible_alternatives,acc,0.44
|
247 |
+
copa,3,median,accuracy,0.44
|
248 |
+
copa,4,best_option,acc,0.62
|
249 |
+
copa,4,cause_effect,acc,0.45
|
250 |
+
copa,4,choose,acc,0.41
|
251 |
+
copa,4,i_am_hesitating,acc,0.43
|
252 |
+
copa,4,plausible_alternatives,acc,0.44
|
253 |
+
copa,4,median,accuracy,0.44
|
254 |
+
copa,5,best_option,acc,0.58
|
255 |
+
copa,5,cause_effect,acc,0.47
|
256 |
+
copa,5,choose,acc,0.44
|
257 |
+
copa,5,i_am_hesitating,acc,0.48
|
258 |
+
copa,5,plausible_alternatives,acc,0.46
|
259 |
+
copa,5,median,accuracy,0.47
|
260 |
+
copa,5,average,multiple,0.4583333333333333
|
261 |
+
e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.06946399430025461
|
262 |
+
e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.022387020564367744
|
263 |
+
e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.0
|
264 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.06289750165250287
|
265 |
+
e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.05547805696954945
|
266 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.05547805696954945
|
267 |
+
e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1605728054943973
|
268 |
+
e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.15944824307809596
|
269 |
+
e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.028616212681722628
|
270 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.12449829406834531
|
271 |
+
e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.19999728868621525
|
272 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.15944824307809596
|
273 |
+
e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.177921888015793
|
274 |
+
e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.17045094780052067
|
275 |
+
e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.07112804192230661
|
276 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.14591205568832014
|
277 |
+
e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.19727207437417654
|
278 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.17045094780052067
|
279 |
+
e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.1797184980766115
|
280 |
+
e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.1725725568885113
|
281 |
+
e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.10309288089148716
|
282 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14908085018598377
|
283 |
+
e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.1964238350803286
|
284 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.1725725568885113
|
285 |
+
e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.17842808573274627
|
286 |
+
e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.1732811817482548
|
287 |
+
e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.1257055751954671
|
288 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.15250785714191883
|
289 |
+
e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19301601907405783
|
290 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1732811817482548
|
291 |
+
e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.17801029501851723
|
292 |
+
e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.17347865411768018
|
293 |
+
e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.14176701175164574
|
294 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.15567663749325128
|
295 |
+
e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.1925812091012645
|
296 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17347865411768018
|
297 |
+
e2e_nlg_cleaned,5,average,multiple,0.1507849401004354
|
298 |
+
gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.021404604329843858
|
299 |
+
gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.04267694660641669
|
300 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04639360161894793
|
301 |
+
gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.037047223818116884
|
302 |
+
gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.048478969239621084
|
303 |
+
gem_xsum,0,median,rouge2_fmeasure,0.04267694660641669
|
304 |
+
gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.019363039161219217
|
305 |
+
gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.046583919666909064
|
306 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.041352212313184845
|
307 |
+
gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04746052242779793
|
308 |
+
gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.03724797091171915
|
309 |
+
gem_xsum,1,median,rouge2_fmeasure,0.041352212313184845
|
310 |
+
gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.024834458548132245
|
311 |
+
gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05089418045053158
|
312 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04614704751240165
|
313 |
+
gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.05137980859666271
|
314 |
+
gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.04154990869291492
|
315 |
+
gem_xsum,2,median,rouge2_fmeasure,0.04614704751240165
|
316 |
+
gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0308541164205269
|
317 |
+
gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.05035265573315919
|
318 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04492583126972709
|
319 |
+
gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.05134777699396957
|
320 |
+
gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.040167379424322004
|
321 |
+
gem_xsum,3,median,rouge2_fmeasure,0.04492583126972709
|
322 |
+
gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.008953373288301222
|
323 |
+
gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.013792609834707413
|
324 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01203821663060757
|
325 |
+
gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.012186294567403028
|
326 |
+
gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.010750703825901103
|
327 |
+
gem_xsum,4,median,rouge2_fmeasure,0.01203821663060757
|
328 |
+
gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
|
329 |
+
gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.0004211068403029026
|
330 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003256051958251534
|
331 |
+
gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.00026998206537521717
|
332 |
+
gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.0
|
333 |
+
gem_xsum,5,median,rouge2_fmeasure,0.00026998206537521717
|
334 |
+
gem_xsum,5,average,multiple,0.031235039399618844
|
335 |
+
piqa,0,Correct the solution,rouge2_fmeasure,0.20168519353681674
|
336 |
+
piqa,0,choose the most appropriate solution,acc,0.49510337323177367
|
337 |
+
piqa,0,no prompt needed,rouge2_fmeasure,0.005612668710216912
|
338 |
+
piqa,0,pick_correct_choice_index,acc,0.49510337323177367
|
339 |
+
piqa,0,what_is_the_correct_ending,acc,0.5609357997823722
|
340 |
+
piqa,0,median,accuracy,0.49510337323177367
|
341 |
+
piqa,1,Correct the solution,rouge2_fmeasure,0.16670313208229318
|
342 |
+
piqa,1,choose the most appropriate solution,acc,0.49510337323177367
|
343 |
+
piqa,1,no prompt needed,rouge2_fmeasure,0.005467793325653137
|
344 |
+
piqa,1,pick_correct_choice_index,acc,0.4967355821545158
|
345 |
+
piqa,1,what_is_the_correct_ending,acc,0.5680087051142546
|
346 |
+
piqa,1,median,accuracy,0.4967355821545158
|
347 |
+
piqa,2,Correct the solution,rouge2_fmeasure,0.1635128839739126
|
348 |
+
piqa,2,choose the most appropriate solution,acc,0.5108813928182807
|
349 |
+
piqa,2,no prompt needed,rouge2_fmeasure,0.004589484275527073
|
350 |
+
piqa,2,pick_correct_choice_index,acc,0.4836779107725789
|
351 |
+
piqa,2,what_is_the_correct_ending,acc,0.5516866158868335
|
352 |
+
piqa,2,median,accuracy,0.5108813928182807
|
353 |
+
piqa,3,Correct the solution,rouge2_fmeasure,0.16228271943343794
|
354 |
+
piqa,3,choose the most appropriate solution,acc,0.5016322089227421
|
355 |
+
piqa,3,no prompt needed,rouge2_fmeasure,0.004446131485933507
|
356 |
+
piqa,3,pick_correct_choice_index,acc,0.4766050054406964
|
357 |
+
piqa,3,what_is_the_correct_ending,acc,0.5625680087051143
|
358 |
+
piqa,3,median,accuracy,0.5016322089227421
|
359 |
+
piqa,4,Correct the solution,rouge2_fmeasure,0.17392590467314775
|
360 |
+
piqa,4,choose the most appropriate solution,acc,0.5021762785636561
|
361 |
+
piqa,4,no prompt needed,rouge2_fmeasure,0.004018163648220329
|
362 |
+
piqa,4,pick_correct_choice_index,acc,0.4896626768226333
|
363 |
+
piqa,4,what_is_the_correct_ending,acc,0.5489662676822633
|
364 |
+
piqa,4,median,accuracy,0.5021762785636561
|
365 |
+
piqa,5,Correct the solution,rouge2_fmeasure,0.18898088840112187
|
366 |
+
piqa,5,choose the most appropriate solution,acc,0.499455930359086
|
367 |
+
piqa,5,no prompt needed,rouge2_fmeasure,0.0037514567534632703
|
368 |
+
piqa,5,pick_correct_choice_index,acc,0.4885745375408052
|
369 |
+
piqa,5,what_is_the_correct_ending,acc,0.5576713819368879
|
370 |
+
piqa,5,median,accuracy,0.499455930359086
|
371 |
+
piqa,5,average,multiple,0.5009974610083424
|
372 |
+
sciq,0,Direct Question,acc,0.862
|
373 |
+
sciq,0,Direct Question (Closed Book),acc,0.498
|
374 |
+
sciq,0,Multiple Choice,acc,0.569
|
375 |
+
sciq,0,Multiple Choice (Closed Book),acc,0.422
|
376 |
+
sciq,0,Multiple Choice Question First,acc,0.571
|
377 |
+
sciq,0,median,accuracy,0.569
|
378 |
+
sciq,1,Direct Question,acc,0.896
|
379 |
+
sciq,1,Direct Question (Closed Book),acc,0.65
|
380 |
+
sciq,1,Multiple Choice,acc,0.55
|
381 |
+
sciq,1,Multiple Choice (Closed Book),acc,0.43
|
382 |
+
sciq,1,Multiple Choice Question First,acc,0.427
|
383 |
+
sciq,1,median,accuracy,0.55
|
384 |
+
sciq,2,Direct Question,acc,0.917
|
385 |
+
sciq,2,Direct Question (Closed Book),acc,0.664
|
386 |
+
sciq,2,Multiple Choice,acc,0.565
|
387 |
+
sciq,2,Multiple Choice (Closed Book),acc,0.441
|
388 |
+
sciq,2,Multiple Choice Question First,acc,0.431
|
389 |
+
sciq,2,median,accuracy,0.565
|
390 |
+
sciq,3,Direct Question,acc,0.921
|
391 |
+
sciq,3,Direct Question (Closed Book),acc,0.681
|
392 |
+
sciq,3,Multiple Choice,acc,0.571
|
393 |
+
sciq,3,Multiple Choice (Closed Book),acc,0.481
|
394 |
+
sciq,3,Multiple Choice Question First,acc,0.441
|
395 |
+
sciq,3,median,accuracy,0.571
|
396 |
+
sciq,4,Direct Question,acc,0.918
|
397 |
+
sciq,4,Direct Question (Closed Book),acc,0.686
|
398 |
+
sciq,4,Multiple Choice,acc,0.588
|
399 |
+
sciq,4,Multiple Choice (Closed Book),acc,0.501
|
400 |
+
sciq,4,Multiple Choice Question First,acc,0.448
|
401 |
+
sciq,4,median,accuracy,0.588
|
402 |
+
sciq,5,Direct Question,acc,0.923
|
403 |
+
sciq,5,Direct Question (Closed Book),acc,0.708
|
404 |
+
sciq,5,Multiple Choice,acc,0.599
|
405 |
+
sciq,5,Multiple Choice (Closed Book),acc,0.524
|
406 |
+
sciq,5,Multiple Choice Question First,acc,0.451
|
407 |
+
sciq,5,median,accuracy,0.599
|
408 |
+
sciq,5,average,multiple,0.5736666666666667
|
409 |
+
story_cloze_2016,0,Answer Given options,acc,0.4719401389631213
|
410 |
+
story_cloze_2016,0,Choose Story Ending,acc,0.484233030464992
|
411 |
+
story_cloze_2016,0,Novel Correct Ending,acc,0.48583645109567075
|
412 |
+
story_cloze_2016,0,Story Continuation and Options,acc,0.4804917156600748
|
413 |
+
story_cloze_2016,0,median,accuracy,0.4823623730625334
|
414 |
+
story_cloze_2016,1,Answer Given options,acc,0.4730090860502405
|
415 |
+
story_cloze_2016,1,Choose Story Ending,acc,0.4794227685729556
|
416 |
+
story_cloze_2016,1,Novel Correct Ending,acc,0.47835382148583644
|
417 |
+
story_cloze_2016,1,Story Continuation and Options,acc,0.4681988241582042
|
418 |
+
story_cloze_2016,1,median,accuracy,0.4756814537680385
|
419 |
+
story_cloze_2016,2,Answer Given options,acc,0.46018172100481025
|
420 |
+
story_cloze_2016,2,Choose Story Ending,acc,0.4596472474612507
|
421 |
+
story_cloze_2016,2,Novel Correct Ending,acc,0.47140566541956175
|
422 |
+
story_cloze_2016,2,Story Continuation and Options,acc,0.4494922501336184
|
423 |
+
story_cloze_2016,2,median,accuracy,0.4599144842330305
|
424 |
+
story_cloze_2016,3,Answer Given options,acc,0.46178514163548906
|
425 |
+
story_cloze_2016,3,Choose Story Ending,acc,0.46873329770176375
|
426 |
+
story_cloze_2016,3,Novel Correct Ending,acc,0.4607161945483699
|
427 |
+
story_cloze_2016,3,Story Continuation and Options,acc,0.4580438268305719
|
428 |
+
story_cloze_2016,3,median,accuracy,0.4612506680919295
|
429 |
+
story_cloze_2016,4,Answer Given options,acc,0.4607161945483699
|
430 |
+
story_cloze_2016,4,Choose Story Ending,acc,0.46018172100481025
|
431 |
+
story_cloze_2016,4,Novel Correct Ending,acc,0.4537680384820951
|
432 |
+
story_cloze_2016,4,Story Continuation and Options,acc,0.4569748797434527
|
433 |
+
story_cloze_2016,4,median,accuracy,0.4585783003741315
|
434 |
+
story_cloze_2016,5,Answer Given options,acc,0.467129877071085
|
435 |
+
story_cloze_2016,5,Choose Story Ending,acc,0.4580438268305719
|
436 |
+
story_cloze_2016,5,Novel Correct Ending,acc,0.4548369855692143
|
437 |
+
story_cloze_2016,5,Story Continuation and Options,acc,0.45056119722073756
|
438 |
+
story_cloze_2016,5,median,accuracy,0.4564404061998931
|
439 |
+
story_cloze_2016,5,average,multiple,0.46570461428825943
|
440 |
+
superglue_rte,0,GPT-3 style,acc,0.516245487364621
|
441 |
+
superglue_rte,0,MNLI crowdsource,acc,0.48375451263537905
|
442 |
+
superglue_rte,0,does it follow that,acc,0.48375451263537905
|
443 |
+
superglue_rte,0,guaranteed true,acc,0.5379061371841155
|
444 |
+
superglue_rte,0,should assume,acc,0.5018050541516246
|
445 |
+
superglue_rte,0,median,accuracy,0.5018050541516246
|
446 |
+
superglue_rte,1,GPT-3 style,acc,0.51985559566787
|
447 |
+
superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
|
448 |
+
superglue_rte,1,does it follow that,acc,0.49097472924187724
|
449 |
+
superglue_rte,1,guaranteed true,acc,0.49097472924187724
|
450 |
+
superglue_rte,1,should assume,acc,0.49097472924187724
|
451 |
+
superglue_rte,1,median,accuracy,0.49097472924187724
|
452 |
+
superglue_rte,2,GPT-3 style,acc,0.51985559566787
|
453 |
+
superglue_rte,2,MNLI crowdsource,acc,0.5018050541516246
|
454 |
+
superglue_rte,2,does it follow that,acc,0.51985559566787
|
455 |
+
superglue_rte,2,guaranteed true,acc,0.5018050541516246
|
456 |
+
superglue_rte,2,should assume,acc,0.5090252707581228
|
457 |
+
superglue_rte,2,median,accuracy,0.5090252707581228
|
458 |
+
superglue_rte,3,GPT-3 style,acc,0.5234657039711191
|
459 |
+
superglue_rte,3,MNLI crowdsource,acc,0.49458483754512633
|
460 |
+
superglue_rte,3,does it follow that,acc,0.516245487364621
|
461 |
+
superglue_rte,3,guaranteed true,acc,0.516245487364621
|
462 |
+
superglue_rte,3,should assume,acc,0.5270758122743683
|
463 |
+
superglue_rte,3,median,accuracy,0.516245487364621
|
464 |
+
superglue_rte,4,GPT-3 style,acc,0.5234657039711191
|
465 |
+
superglue_rte,4,MNLI crowdsource,acc,0.4584837545126354
|
466 |
+
superglue_rte,4,does it follow that,acc,0.516245487364621
|
467 |
+
superglue_rte,4,guaranteed true,acc,0.49458483754512633
|
468 |
+
superglue_rte,4,should assume,acc,0.516245487364621
|
469 |
+
superglue_rte,4,median,accuracy,0.516245487364621
|
470 |
+
superglue_rte,5,GPT-3 style,acc,0.5270758122743683
|
471 |
+
superglue_rte,5,MNLI crowdsource,acc,0.44765342960288806
|
472 |
+
superglue_rte,5,does it follow that,acc,0.4981949458483754
|
473 |
+
superglue_rte,5,guaranteed true,acc,0.47653429602888087
|
474 |
+
superglue_rte,5,should assume,acc,0.51985559566787
|
475 |
+
superglue_rte,5,median,accuracy,0.4981949458483754
|
476 |
+
superglue_rte,5,average,multiple,0.5054151624548736
|
477 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04998894903569846
|
478 |
+
web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.008447632785522565
|
479 |
+
web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.004552592244101363
|
480 |
+
web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.012155444595988949
|
481 |
+
web_nlg_en,0,very-explicit-description,rouge2_fmeasure,1.477135016534563e-05
|
482 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.008447632785522565
|
483 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051345397484036256
|
484 |
+
web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.09334851918279623
|
485 |
+
web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.057734516186082864
|
486 |
+
web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.12050338190039989
|
487 |
+
web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.08099215111185744
|
488 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.08099215111185744
|
489 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.053828506115298144
|
490 |
+
web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.23334399501205214
|
491 |
+
web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.07451843059126674
|
492 |
+
web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.13465046885050352
|
493 |
+
web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.11449533899405696
|
494 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.11449533899405696
|
495 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.051724489676439236
|
496 |
+
web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.2613220581162864
|
497 |
+
web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.08217622838318879
|
498 |
+
web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.14344396360996736
|
499 |
+
web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.11630716134029079
|
500 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.11630716134029079
|
501 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.052942763106877684
|
502 |
+
web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.2637473868679139
|
503 |
+
web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.08649984259029919
|
504 |
+
web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.14528102357273256
|
505 |
+
web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.11317180592447942
|
506 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.11317180592447942
|
507 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054089458597439195
|
508 |
+
web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.2771416755952798
|
509 |
+
web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.08905956074973984
|
510 |
+
web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.14969245864050304
|
511 |
+
web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.11088772044085522
|
512 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.11088772044085522
|
513 |
+
web_nlg_en,5,average,multiple,0.09071696843284373
|
514 |
+
wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.0401546744710191
|
515 |
+
wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.011775575519769068
|
516 |
+
wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.017208857113077958
|
517 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03393757157227001
|
518 |
+
wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.018163595558952392
|
519 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.018163595558952392
|
520 |
+
wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.04582510801169016
|
521 |
+
wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.028480082558715494
|
522 |
+
wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.03420510386300404
|
523 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05423182118294372
|
524 |
+
wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.015006978500477466
|
525 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.03420510386300404
|
526 |
+
wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.05097602932656367
|
527 |
+
wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.04342414284752008
|
528 |
+
wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.04558123756721243
|
529 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05619324678157442
|
530 |
+
wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.015011655554296155
|
531 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.04558123756721243
|
532 |
+
wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.042273822347749235
|
533 |
+
wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03703687600072442
|
534 |
+
wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.04051754398323109
|
535 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04775424867054453
|
536 |
+
wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.015436277594316166
|
537 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04051754398323109
|
538 |
+
wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.013928620381908459
|
539 |
+
wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.011604393750423206
|
540 |
+
wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.01222026826951555
|
541 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014607128479951145
|
542 |
+
wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.003915565527437388
|
543 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.01222026826951555
|
544 |
+
wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.0020632447863588753
|
545 |
+
wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0018773141620031116
|
546 |
+
wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0014447433493777688
|
547 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0027143726441978717
|
548 |
+
wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.00024341445452042402
|
549 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0018773141620031116
|
550 |
+
wiki_lingua_en,5,average,multiple,0.025427510567319768
|
551 |
+
winogrande,0,Replace,acc,0.5090765588003157
|
552 |
+
winogrande,0,True or False,acc,0.4956590370955012
|
553 |
+
winogrande,0,does underscore refer to,acc,0.5082872928176796
|
554 |
+
winogrande,0,stand for,acc,0.5082872928176796
|
555 |
+
winogrande,0,underscore refer to,acc,0.4956590370955012
|
556 |
+
winogrande,0,median,accuracy,0.5082872928176796
|
557 |
+
winogrande,1,Replace,acc,0.4964483030781373
|
558 |
+
winogrande,1,True or False,acc,0.5082872928176796
|
559 |
+
winogrande,1,does underscore refer to,acc,0.5074980268350434
|
560 |
+
winogrande,1,stand for,acc,0.4996053670086819
|
561 |
+
winogrande,1,underscore refer to,acc,0.4980268350434096
|
562 |
+
winogrande,1,median,accuracy,0.4996053670086819
|
563 |
+
winogrande,2,Replace,acc,0.4846093133385951
|
564 |
+
winogrande,2,True or False,acc,0.489344909234412
|
565 |
+
winogrande,2,does underscore refer to,acc,0.5122336227308603
|
566 |
+
winogrande,2,stand for,acc,0.5043409629044988
|
567 |
+
winogrande,2,underscore refer to,acc,0.4988161010260458
|
568 |
+
winogrande,2,median,accuracy,0.4988161010260458
|
569 |
+
winogrande,3,Replace,acc,0.5019731649565904
|
570 |
+
winogrande,3,True or False,acc,0.49013417521704816
|
571 |
+
winogrande,3,does underscore refer to,acc,0.5240726124704025
|
572 |
+
winogrande,3,stand for,acc,0.4940805051302289
|
573 |
+
winogrande,3,underscore refer to,acc,0.5153906866614049
|
574 |
+
winogrande,3,median,accuracy,0.5019731649565904
|
575 |
+
winogrande,4,Replace,acc,0.4996053670086819
|
576 |
+
winogrande,4,True or False,acc,0.5035516969218626
|
577 |
+
winogrande,4,does underscore refer to,acc,0.5169692186266772
|
578 |
+
winogrande,4,stand for,acc,0.505130228887135
|
579 |
+
winogrande,4,underscore refer to,acc,0.5256511444356748
|
580 |
+
winogrande,4,median,accuracy,0.505130228887135
|
581 |
+
winogrande,5,Replace,acc,0.5035516969218626
|
582 |
+
winogrande,5,True or False,acc,0.505130228887135
|
583 |
+
winogrande,5,does underscore refer to,acc,0.5169692186266772
|
584 |
+
winogrande,5,stand for,acc,0.5327545382794001
|
585 |
+
winogrande,5,underscore refer to,acc,0.5256511444356748
|
586 |
+
winogrande,5,median,accuracy,0.5169692186266772
|
587 |
+
winogrande,5,average,multiple,0.505130228887135
|
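Each merged.csv in this commit follows the schema dataset,fewshots,prompt,metric,value: for every dataset and few-shot count there is one row per prompt variant, a "median" row (median over those prompt scores), and a final per-dataset "average" row (mean of the per-fewshot medians). The following is a minimal sketch, assuming pandas is installed, of how those aggregate rows could be recomputed from the raw per-prompt rows; the file path is illustrative only.

import pandas as pd

# Illustrative path; substitute any merged.csv from this commit.
df = pd.read_csv("4b284b21bc4/eval/merged.csv")

# Drop the precomputed aggregate rows so only raw per-prompt scores remain.
raw = df[~df["prompt"].isin(["median", "average"])]

# Median across prompts for every (dataset, fewshots, metric) combination.
medians = raw.groupby(["dataset", "fewshots", "metric"])["value"].median()

# Mean of the per-fewshot medians, per dataset and metric. For tasks that
# mix metrics (e.g. piqa reports both acc and rouge2_fmeasure prompts), the
# published "average" row appears to correspond to the accuracy entry here.
averages = medians.groupby(level=["dataset", "metric"]).mean()

print(medians.head())
print(averages)

This sketch reproduces, for example, the boolq per-fewshot medians and the boolq average of 0.5895555555555556 shown above; it is a reading aid for the CSV layout, not part of the evaluation pipeline itself.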
4b284b21bc4/eval/merged.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b284b28bc4/eval/merged.csv
ADDED
@@ -0,0 +1,587 @@
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
anli_r1,0,GPT-3 style,acc,0.329
|
3 |
+
anli_r1,0,MNLI crowdsource,acc,0.334
|
4 |
+
anli_r1,0,can we infer,acc,0.334
|
5 |
+
anli_r1,0,guaranteed/possible/impossible,acc,0.332
|
6 |
+
anli_r1,0,justified in saying,acc,0.344
|
7 |
+
anli_r1,0,median,accuracy,0.334
|
8 |
+
anli_r1,1,GPT-3 style,acc,0.355
|
9 |
+
anli_r1,1,MNLI crowdsource,acc,0.333
|
10 |
+
anli_r1,1,can we infer,acc,0.333
|
11 |
+
anli_r1,1,guaranteed/possible/impossible,acc,0.343
|
12 |
+
anli_r1,1,justified in saying,acc,0.332
|
13 |
+
anli_r1,1,median,accuracy,0.333
|
14 |
+
anli_r1,2,GPT-3 style,acc,0.36
|
15 |
+
anli_r1,2,MNLI crowdsource,acc,0.352
|
16 |
+
anli_r1,2,can we infer,acc,0.355
|
17 |
+
anli_r1,2,guaranteed/possible/impossible,acc,0.32
|
18 |
+
anli_r1,2,justified in saying,acc,0.351
|
19 |
+
anli_r1,2,median,accuracy,0.352
|
20 |
+
anli_r1,3,GPT-3 style,acc,0.363
|
21 |
+
anli_r1,3,MNLI crowdsource,acc,0.361
|
22 |
+
anli_r1,3,can we infer,acc,0.36
|
23 |
+
anli_r1,3,guaranteed/possible/impossible,acc,0.326
|
24 |
+
anli_r1,3,justified in saying,acc,0.347
|
25 |
+
anli_r1,3,median,accuracy,0.36
|
26 |
+
anli_r1,4,GPT-3 style,acc,0.349
|
27 |
+
anli_r1,4,MNLI crowdsource,acc,0.35
|
28 |
+
anli_r1,4,can we infer,acc,0.334
|
29 |
+
anli_r1,4,guaranteed/possible/impossible,acc,0.332
|
30 |
+
anli_r1,4,justified in saying,acc,0.331
|
31 |
+
anli_r1,4,median,accuracy,0.334
|
32 |
+
anli_r1,5,GPT-3 style,acc,0.364
|
33 |
+
anli_r1,5,MNLI crowdsource,acc,0.351
|
34 |
+
anli_r1,5,can we infer,acc,0.338
|
35 |
+
anli_r1,5,guaranteed/possible/impossible,acc,0.33
|
36 |
+
anli_r1,5,justified in saying,acc,0.33
|
37 |
+
anli_r1,5,median,accuracy,0.338
|
38 |
+
anli_r1,5,average,multiple,0.3418333333333333
|
39 |
+
anli_r2,0,GPT-3 style,acc,0.333
|
40 |
+
anli_r2,0,MNLI crowdsource,acc,0.334
|
41 |
+
anli_r2,0,can we infer,acc,0.329
|
42 |
+
anli_r2,0,guaranteed/possible/impossible,acc,0.333
|
43 |
+
anli_r2,0,justified in saying,acc,0.331
|
44 |
+
anli_r2,0,median,accuracy,0.333
|
45 |
+
anli_r2,1,GPT-3 style,acc,0.315
|
46 |
+
anli_r2,1,MNLI crowdsource,acc,0.315
|
47 |
+
anli_r2,1,can we infer,acc,0.315
|
48 |
+
anli_r2,1,guaranteed/possible/impossible,acc,0.311
|
49 |
+
anli_r2,1,justified in saying,acc,0.315
|
50 |
+
anli_r2,1,median,accuracy,0.315
|
51 |
+
anli_r2,2,GPT-3 style,acc,0.334
|
52 |
+
anli_r2,2,MNLI crowdsource,acc,0.316
|
53 |
+
anli_r2,2,can we infer,acc,0.324
|
54 |
+
anli_r2,2,guaranteed/possible/impossible,acc,0.326
|
55 |
+
anli_r2,2,justified in saying,acc,0.32
|
56 |
+
anli_r2,2,median,accuracy,0.324
|
57 |
+
anli_r2,3,GPT-3 style,acc,0.326
|
58 |
+
anli_r2,3,MNLI crowdsource,acc,0.317
|
59 |
+
anli_r2,3,can we infer,acc,0.324
|
60 |
+
anli_r2,3,guaranteed/possible/impossible,acc,0.341
|
61 |
+
anli_r2,3,justified in saying,acc,0.324
|
62 |
+
anli_r2,3,median,accuracy,0.324
|
63 |
+
anli_r2,4,GPT-3 style,acc,0.34
|
64 |
+
anli_r2,4,MNLI crowdsource,acc,0.32
|
65 |
+
anli_r2,4,can we infer,acc,0.314
|
66 |
+
anli_r2,4,guaranteed/possible/impossible,acc,0.332
|
67 |
+
anli_r2,4,justified in saying,acc,0.317
|
68 |
+
anli_r2,4,median,accuracy,0.32
|
69 |
+
anli_r2,5,GPT-3 style,acc,0.317
|
70 |
+
anli_r2,5,MNLI crowdsource,acc,0.312
|
71 |
+
anli_r2,5,can we infer,acc,0.321
|
72 |
+
anli_r2,5,guaranteed/possible/impossible,acc,0.339
|
73 |
+
anli_r2,5,justified in saying,acc,0.331
|
74 |
+
anli_r2,5,median,accuracy,0.321
|
75 |
+
anli_r2,5,average,multiple,0.32283333333333336
|
76 |
+
anli_r3,0,GPT-3 style,acc,0.3275
|
77 |
+
anli_r3,0,MNLI crowdsource,acc,0.3375
|
78 |
+
anli_r3,0,can we infer,acc,0.32666666666666666
|
79 |
+
anli_r3,0,guaranteed/possible/impossible,acc,0.3075
|
80 |
+
anli_r3,0,justified in saying,acc,0.3475
|
81 |
+
anli_r3,0,median,accuracy,0.3275
|
82 |
+
anli_r3,1,GPT-3 style,acc,0.335
|
83 |
+
anli_r3,1,MNLI crowdsource,acc,0.335
|
84 |
+
anli_r3,1,can we infer,acc,0.33666666666666667
|
85 |
+
anli_r3,1,guaranteed/possible/impossible,acc,0.3375
|
86 |
+
anli_r3,1,justified in saying,acc,0.3358333333333333
|
87 |
+
anli_r3,1,median,accuracy,0.3358333333333333
|
88 |
+
anli_r3,2,GPT-3 style,acc,0.32166666666666666
|
89 |
+
anli_r3,2,MNLI crowdsource,acc,0.32916666666666666
|
90 |
+
anli_r3,2,can we infer,acc,0.31333333333333335
|
91 |
+
anli_r3,2,guaranteed/possible/impossible,acc,0.32083333333333336
|
92 |
+
anli_r3,2,justified in saying,acc,0.32
|
93 |
+
anli_r3,2,median,accuracy,0.32083333333333336
|
94 |
+
anli_r3,3,GPT-3 style,acc,0.33166666666666667
|
95 |
+
anli_r3,3,MNLI crowdsource,acc,0.3425
|
96 |
+
anli_r3,3,can we infer,acc,0.3433333333333333
|
97 |
+
anli_r3,3,guaranteed/possible/impossible,acc,0.3275
|
98 |
+
anli_r3,3,justified in saying,acc,0.3525
|
99 |
+
anli_r3,3,median,accuracy,0.3425
|
100 |
+
anli_r3,4,GPT-3 style,acc,0.32166666666666666
|
101 |
+
anli_r3,4,MNLI crowdsource,acc,0.335
|
102 |
+
anli_r3,4,can we infer,acc,0.3225
|
103 |
+
anli_r3,4,guaranteed/possible/impossible,acc,0.3408333333333333
|
104 |
+
anli_r3,4,justified in saying,acc,0.31916666666666665
|
105 |
+
anli_r3,4,median,accuracy,0.3225
|
106 |
+
anli_r3,5,GPT-3 style,acc,0.315
|
107 |
+
anli_r3,5,MNLI crowdsource,acc,0.31833333333333336
|
108 |
+
anli_r3,5,can we infer,acc,0.31166666666666665
|
109 |
+
anli_r3,5,guaranteed/possible/impossible,acc,0.3375
|
110 |
+
anli_r3,5,justified in saying,acc,0.315
|
111 |
+
anli_r3,5,median,accuracy,0.315
|
112 |
+
anli_r3,5,average,multiple,0.3273611111111111
|
113 |
+
arc_easy,0,heres_a_problem,acc,0.255050505050505
|
114 |
+
arc_easy,0,i_am_hesitating,acc,0.35185185185185186
|
115 |
+
arc_easy,0,multiple_choice,acc,0.2354948805460751
|
116 |
+
arc_easy,0,pick_the_most_correct_option,acc,0.2563131313131313
|
117 |
+
arc_easy,0,qa_options,acc,0.35395622895622897
|
118 |
+
arc_easy,0,median,accuracy,0.2563131313131313
|
119 |
+
arc_easy,1,heres_a_problem,acc,0.23208191126279865
|
120 |
+
arc_easy,1,i_am_hesitating,acc,0.2713310580204778
|
121 |
+
arc_easy,1,multiple_choice,acc,0.25
|
122 |
+
arc_easy,1,pick_the_most_correct_option,acc,0.24284511784511784
|
123 |
+
arc_easy,1,qa_options,acc,0.3291245791245791
|
124 |
+
arc_easy,1,median,accuracy,0.25
|
125 |
+
arc_easy,2,heres_a_problem,acc,0.2558922558922559
|
126 |
+
arc_easy,2,i_am_hesitating,acc,0.3333333333333333
|
127 |
+
arc_easy,2,multiple_choice,acc,0.3282828282828283
|
128 |
+
arc_easy,2,pick_the_most_correct_option,acc,0.2563131313131313
|
129 |
+
arc_easy,2,qa_options,acc,0.32154882154882153
|
130 |
+
arc_easy,2,median,accuracy,0.32154882154882153
|
131 |
+
arc_easy,3,heres_a_problem,acc,0.22866894197952217
|
132 |
+
arc_easy,3,i_am_hesitating,acc,0.335016835016835
|
133 |
+
arc_easy,3,multiple_choice,acc,0.26023890784982934
|
134 |
+
arc_easy,3,pick_the_most_correct_option,acc,0.24621212121212122
|
135 |
+
arc_easy,3,qa_options,acc,0.2738907849829352
|
136 |
+
arc_easy,3,median,accuracy,0.26023890784982934
|
137 |
+
arc_easy,4,heres_a_problem,acc,0.24061433447098976
|
138 |
+
arc_easy,4,i_am_hesitating,acc,0.32407407407407407
|
139 |
+
arc_easy,4,multiple_choice,acc,0.26535836177474403
|
140 |
+
arc_easy,4,pick_the_most_correct_option,acc,0.24705387205387205
|
141 |
+
arc_easy,4,qa_options,acc,0.26023890784982934
|
142 |
+
arc_easy,4,median,accuracy,0.26023890784982934
|
143 |
+
arc_easy,5,heres_a_problem,acc,0.24663299663299662
|
144 |
+
arc_easy,5,i_am_hesitating,acc,0.3202861952861953
|
145 |
+
arc_easy,5,multiple_choice,acc,0.257679180887372
|
146 |
+
arc_easy,5,pick_the_most_correct_option,acc,0.25252525252525254
|
147 |
+
arc_easy,5,qa_options,acc,0.3164983164983165
|
148 |
+
arc_easy,5,median,accuracy,0.257679180887372
|
149 |
+
arc_easy,5,average,multiple,0.26766982490816393
|
150 |
+
boolq,0,GPT-3 Style,acc,0.589
|
151 |
+
boolq,0,after_reading,acc,0.6206666666666667
|
152 |
+
boolq,0,exercise,acc,0.6226666666666667
|
153 |
+
boolq,0,valid_binary,acc,0.49766666666666665
|
154 |
+
boolq,0,yes_no_question,acc,0.38966666666666666
|
155 |
+
boolq,0,median,accuracy,0.589
|
156 |
+
boolq,1,GPT-3 Style,acc,0.6156666666666667
|
157 |
+
boolq,1,after_reading,acc,0.5406666666666666
|
158 |
+
boolq,1,exercise,acc,0.5423333333333333
|
159 |
+
boolq,1,valid_binary,acc,0.5426666666666666
|
160 |
+
boolq,1,yes_no_question,acc,0.5406666666666666
|
161 |
+
boolq,1,median,accuracy,0.5423333333333333
|
162 |
+
boolq,2,GPT-3 Style,acc,0.6273333333333333
|
163 |
+
boolq,2,after_reading,acc,0.5963333333333334
|
164 |
+
boolq,2,exercise,acc,0.5473333333333333
|
165 |
+
boolq,2,valid_binary,acc,0.5913333333333334
|
166 |
+
boolq,2,yes_no_question,acc,0.595
|
167 |
+
boolq,2,median,accuracy,0.595
|
168 |
+
boolq,3,GPT-3 Style,acc,0.6313333333333333
|
169 |
+
boolq,3,after_reading,acc,0.613
|
170 |
+
boolq,3,exercise,acc,0.546
|
171 |
+
boolq,3,valid_binary,acc,0.6136666666666667
|
172 |
+
boolq,3,yes_no_question,acc,0.6096666666666667
|
173 |
+
boolq,3,median,accuracy,0.613
|
174 |
+
boolq,4,GPT-3 Style,acc,0.6323333333333333
|
175 |
+
boolq,4,after_reading,acc,0.6173333333333333
|
176 |
+
boolq,4,exercise,acc,0.5476666666666666
|
177 |
+
boolq,4,valid_binary,acc,0.6156666666666667
|
178 |
+
boolq,4,yes_no_question,acc,0.6206666666666667
|
179 |
+
boolq,4,median,accuracy,0.6173333333333333
|
180 |
+
boolq,5,GPT-3 Style,acc,0.6276666666666667
|
181 |
+
boolq,5,after_reading,acc,0.62
|
182 |
+
boolq,5,exercise,acc,0.5383333333333333
|
183 |
+
boolq,5,valid_binary,acc,0.6183333333333333
|
184 |
+
boolq,5,yes_no_question,acc,0.616
|
185 |
+
boolq,5,median,accuracy,0.6183333333333333
|
186 |
+
boolq,5,average,multiple,0.5958333333333333
|
187 |
+
cb,0,GPT-3 style,acc,0.39285714285714285
|
188 |
+
cb,0,MNLI crowdsource,acc,0.39285714285714285
|
189 |
+
cb,0,can we infer,acc,0.39285714285714285
|
190 |
+
cb,0,guaranteed/possible/impossible,acc,0.30357142857142855
|
191 |
+
cb,0,justified in saying,acc,0.3392857142857143
|
192 |
+
cb,0,median,accuracy,0.39285714285714285
|
193 |
+
cb,1,GPT-3 style,acc,0.39285714285714285
|
194 |
+
cb,1,MNLI crowdsource,acc,0.39285714285714285
|
195 |
+
cb,1,can we infer,acc,0.39285714285714285
|
196 |
+
cb,1,guaranteed/possible/impossible,acc,0.35714285714285715
|
197 |
+
cb,1,justified in saying,acc,0.39285714285714285
|
198 |
+
cb,1,median,accuracy,0.39285714285714285
|
199 |
+
cb,2,GPT-3 style,acc,0.44642857142857145
|
200 |
+
cb,2,MNLI crowdsource,acc,0.44642857142857145
|
201 |
+
cb,2,can we infer,acc,0.44642857142857145
|
202 |
+
cb,2,guaranteed/possible/impossible,acc,0.3392857142857143
|
203 |
+
cb,2,justified in saying,acc,0.44642857142857145
|
204 |
+
cb,2,median,accuracy,0.44642857142857145
|
205 |
+
cb,3,GPT-3 style,acc,0.44642857142857145
|
206 |
+
cb,3,MNLI crowdsource,acc,0.3392857142857143
|
207 |
+
cb,3,can we infer,acc,0.44642857142857145
|
208 |
+
cb,3,guaranteed/possible/impossible,acc,0.26785714285714285
|
209 |
+
cb,3,justified in saying,acc,0.39285714285714285
|
210 |
+
cb,3,median,accuracy,0.39285714285714285
|
211 |
+
cb,4,GPT-3 style,acc,0.48214285714285715
|
212 |
+
cb,4,MNLI crowdsource,acc,0.35714285714285715
|
213 |
+
cb,4,can we infer,acc,0.44642857142857145
|
214 |
+
cb,4,guaranteed/possible/impossible,acc,0.21428571428571427
|
215 |
+
cb,4,justified in saying,acc,0.4107142857142857
|
216 |
+
cb,4,median,accuracy,0.4107142857142857
|
217 |
+
cb,5,GPT-3 style,acc,0.44642857142857145
|
218 |
+
cb,5,MNLI crowdsource,acc,0.375
|
219 |
+
cb,5,can we infer,acc,0.4107142857142857
|
220 |
+
cb,5,guaranteed/possible/impossible,acc,0.21428571428571427
|
221 |
+
cb,5,justified in saying,acc,0.44642857142857145
|
222 |
+
cb,5,median,accuracy,0.4107142857142857
|
223 |
+
cb,5,average,multiple,0.40773809523809523
|
224 |
+
copa,0,best_option,acc,0.6
|
225 |
+
copa,0,cause_effect,acc,0.6
|
226 |
+
copa,0,choose,acc,0.6
|
227 |
+
copa,0,i_am_hesitating,acc,0.56
|
228 |
+
copa,0,plausible_alternatives,acc,0.57
|
229 |
+
copa,0,median,accuracy,0.6
|
230 |
+
copa,1,best_option,acc,0.5
|
231 |
+
copa,1,cause_effect,acc,0.46
|
232 |
+
copa,1,choose,acc,0.48
|
233 |
+
copa,1,i_am_hesitating,acc,0.47
|
234 |
+
copa,1,plausible_alternatives,acc,0.46
|
235 |
+
copa,1,median,accuracy,0.47
|
236 |
+
copa,2,best_option,acc,0.48
|
237 |
+
copa,2,cause_effect,acc,0.43
|
238 |
+
copa,2,choose,acc,0.47
|
239 |
+
copa,2,i_am_hesitating,acc,0.42
|
240 |
+
copa,2,plausible_alternatives,acc,0.44
|
241 |
+
copa,2,median,accuracy,0.44
|
242 |
+
copa,3,best_option,acc,0.52
|
243 |
+
copa,3,cause_effect,acc,0.45
|
244 |
+
copa,3,choose,acc,0.44
|
245 |
+
copa,3,i_am_hesitating,acc,0.48
|
246 |
+
copa,3,plausible_alternatives,acc,0.43
|
247 |
+
copa,3,median,accuracy,0.45
|
248 |
+
copa,4,best_option,acc,0.53
|
249 |
+
copa,4,cause_effect,acc,0.47
|
250 |
+
copa,4,choose,acc,0.43
|
251 |
+
copa,4,i_am_hesitating,acc,0.45
|
252 |
+
copa,4,plausible_alternatives,acc,0.46
|
253 |
+
copa,4,median,accuracy,0.46
|
254 |
+
copa,5,best_option,acc,0.5
|
255 |
+
copa,5,cause_effect,acc,0.47
|
256 |
+
copa,5,choose,acc,0.5
|
257 |
+
copa,5,i_am_hesitating,acc,0.49
|
258 |
+
copa,5,plausible_alternatives,acc,0.46
|
259 |
+
copa,5,median,accuracy,0.49
|
260 |
+
copa,5,average,multiple,0.485
|
261 |
+
e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.07227415925772734
|
262 |
+
e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.01749967959480122
|
263 |
+
e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,5.466015466015465e-05
|
264 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.010022915068112901
|
265 |
+
e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.053555609070310047
|
266 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.01749967959480122
|
267 |
+
e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1874037992560188
|
268 |
+
e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16624694535550544
|
269 |
+
e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.034204603336784774
|
270 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.20489136085595536
|
271 |
+
e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.20027753375836946
|
272 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1874037992560188
|
273 |
+
e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.18976901148520062
|
274 |
+
e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.1773205809463223
|
275 |
+
e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.07093954631020417
|
276 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2325284196471626
|
277 |
+
e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.1872354549740021
|
278 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1872354549740021
|
279 |
+
e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.19128265567370034
|
280 |
+
e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.1814061261607152
|
281 |
+
e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.09632029448061452
|
282 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24388713793667496
|
283 |
+
e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.18544033119790024
|
284 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18544033119790024
|
285 |
+
e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.18797801160921448
|
286 |
+
e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.1839557774098788
|
287 |
+
e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.11048253025142175
|
288 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24852828649672148
|
289 |
+
e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.18624803563290596
|
290 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18624803563290596
|
291 |
+
e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.18412349501175196
|
292 |
+
e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.1840109955212137
|
293 |
+
e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.11427201259332177
|
294 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24634621400768708
|
295 |
+
e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.18362949663211528
|
296 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1840109955212137
|
297 |
+
e2e_nlg_cleaned,5,average,multiple,0.157973049362807
|
298 |
+
gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.021087066079578522
|
299 |
+
gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.05712816335799515
|
300 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0511787638415587
|
301 |
+
gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.04876836268401361
|
302 |
+
gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.058199865858872574
|
303 |
+
gem_xsum,0,median,rouge2_fmeasure,0.0511787638415587
|
304 |
+
gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.01583906203117334
|
305 |
+
gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.052391077282527426
|
306 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04515071736102295
|
307 |
+
gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.044490758416455556
|
308 |
+
gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.04062921012242493
|
309 |
+
gem_xsum,1,median,rouge2_fmeasure,0.044490758416455556
|
310 |
+
gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.025696677971347164
|
311 |
+
gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05747020905233346
|
312 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.047730927310845786
|
313 |
+
gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.044164071082009024
|
314 |
+
gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.04062641993018198
|
315 |
+
gem_xsum,2,median,rouge2_fmeasure,0.044164071082009024
|
316 |
+
gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03211728661220163
|
317 |
+
gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.05413486443607653
|
318 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04656621187743751
|
319 |
+
gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.045340440062370646
|
320 |
+
gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.040076565671093634
|
321 |
+
gem_xsum,3,median,rouge2_fmeasure,0.045340440062370646
|
322 |
+
gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.009527033234766386
|
323 |
+
gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.013394347210809258
|
324 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010817994039374855
|
325 |
+
gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.012207544410113281
|
326 |
+
gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.00916357714654539
|
327 |
+
gem_xsum,4,median,rouge2_fmeasure,0.010817994039374855
|
328 |
+
gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
|
329 |
+
gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.0002940707111925786
|
330 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001299594149643802
|
331 |
+
gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.0005146148795680421
|
332 |
+
gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.0
|
333 |
+
gem_xsum,5,median,rouge2_fmeasure,0.0001299594149643802
|
334 |
+
gem_xsum,5,average,multiple,0.032686997809455526
|
335 |
+
piqa,0,Correct the solution,rouge2_fmeasure,0.24351260224849494
|
336 |
+
piqa,0,choose the most appropriate solution,acc,0.49510337323177367
|
337 |
+
piqa,0,no prompt needed,rouge2_fmeasure,0.005621669068984276
|
338 |
+
piqa,0,pick_correct_choice_index,acc,0.49510337323177367
|
339 |
+
piqa,0,what_is_the_correct_ending,acc,0.5669205658324266
|
340 |
+
piqa,0,median,accuracy,0.49510337323177367
|
341 |
+
piqa,1,Correct the solution,rouge2_fmeasure,0.2728457121204912
|
342 |
+
piqa,1,choose the most appropriate solution,acc,0.5021762785636561
|
343 |
+
piqa,1,no prompt needed,rouge2_fmeasure,0.005440119305280117
|
344 |
+
piqa,1,pick_correct_choice_index,acc,0.500544069640914
|
345 |
+
piqa,1,what_is_the_correct_ending,acc,0.5495103373231773
|
346 |
+
piqa,1,median,accuracy,0.5021762785636561
|
347 |
+
piqa,2,Correct the solution,rouge2_fmeasure,0.4813163134151628
|
348 |
+
piqa,2,choose the most appropriate solution,acc,0.4929270946681175
|
349 |
+
piqa,2,no prompt needed,rouge2_fmeasure,0.004831334688117916
|
350 |
+
piqa,2,pick_correct_choice_index,acc,0.48748639825897716
|
351 |
+
piqa,2,what_is_the_correct_ending,acc,0.529923830250272
|
352 |
+
piqa,2,median,accuracy,0.4929270946681175
|
353 |
+
piqa,3,Correct the solution,rouge2_fmeasure,0.5130291514276495
|
354 |
+
piqa,3,choose the most appropriate solution,acc,0.5065288356909684
|
355 |
+
piqa,3,no prompt needed,rouge2_fmeasure,0.004497352970943405
|
356 |
+
piqa,3,pick_correct_choice_index,acc,0.4776931447225245
|
357 |
+
piqa,3,what_is_the_correct_ending,acc,0.529379760609358
|
358 |
+
piqa,3,median,accuracy,0.5065288356909684
|
359 |
+
piqa,4,Correct the solution,rouge2_fmeasure,0.5191957202312659
|
360 |
+
piqa,4,choose the most appropriate solution,acc,0.5059847660500544
|
361 |
+
piqa,4,no prompt needed,rouge2_fmeasure,0.00396783046089762
|
362 |
+
piqa,4,pick_correct_choice_index,acc,0.5021762785636561
|
363 |
+
piqa,4,what_is_the_correct_ending,acc,0.5277475516866159
|
364 |
+
piqa,4,median,accuracy,0.5059847660500544
|
365 |
+
piqa,5,Correct the solution,rouge2_fmeasure,0.5349097989485111
|
366 |
+
piqa,5,choose the most appropriate solution,acc,0.5032644178454843
|
367 |
+
piqa,5,no prompt needed,rouge2_fmeasure,0.0042412326403525056
|
368 |
+
piqa,5,pick_correct_choice_index,acc,0.4967355821545158
|
369 |
+
piqa,5,what_is_the_correct_ending,acc,0.5348204570184983
|
370 |
+
piqa,5,median,accuracy,0.5032644178454843
|
371 |
+
piqa,5,average,multiple,0.5009974610083424
|
372 |
+
sciq,0,Direct Question,acc,0.866
|
373 |
+
sciq,0,Direct Question (Closed Book),acc,0.617
|
374 |
+
sciq,0,Multiple Choice,acc,0.583
|
375 |
+
sciq,0,Multiple Choice (Closed Book),acc,0.46
|
376 |
+
sciq,0,Multiple Choice Question First,acc,0.534
|
377 |
+
sciq,0,median,accuracy,0.583
|
378 |
+
sciq,1,Direct Question,acc,0.9
|
379 |
+
sciq,1,Direct Question (Closed Book),acc,0.675
|
380 |
+
sciq,1,Multiple Choice,acc,0.507
|
381 |
+
sciq,1,Multiple Choice (Closed Book),acc,0.457
|
382 |
+
sciq,1,Multiple Choice Question First,acc,0.387
|
383 |
+
sciq,1,median,accuracy,0.507
|
384 |
+
sciq,2,Direct Question,acc,0.901
|
385 |
+
sciq,2,Direct Question (Closed Book),acc,0.689
|
386 |
+
sciq,2,Multiple Choice,acc,0.548
|
387 |
+
sciq,2,Multiple Choice (Closed Book),acc,0.543
|
388 |
+
sciq,2,Multiple Choice Question First,acc,0.42
|
389 |
+
sciq,2,median,accuracy,0.548
|
390 |
+
sciq,3,Direct Question,acc,0.911
|
391 |
+
sciq,3,Direct Question (Closed Book),acc,0.696
|
392 |
+
sciq,3,Multiple Choice,acc,0.575
|
393 |
+
sciq,3,Multiple Choice (Closed Book),acc,0.57
|
394 |
+
sciq,3,Multiple Choice Question First,acc,0.42
|
395 |
+
sciq,3,median,accuracy,0.575
|
396 |
+
sciq,4,Direct Question,acc,0.904
|
397 |
+
sciq,4,Direct Question (Closed Book),acc,0.709
|
398 |
+
sciq,4,Multiple Choice,acc,0.584
|
399 |
+
sciq,4,Multiple Choice (Closed Book),acc,0.565
|
400 |
+
sciq,4,Multiple Choice Question First,acc,0.445
|
401 |
+
sciq,4,median,accuracy,0.584
|
402 |
+
sciq,5,Direct Question,acc,0.906
|
403 |
+
sciq,5,Direct Question (Closed Book),acc,0.714
|
404 |
+
sciq,5,Multiple Choice,acc,0.581
|
405 |
+
sciq,5,Multiple Choice (Closed Book),acc,0.579
|
406 |
+
sciq,5,Multiple Choice Question First,acc,0.462
|
407 |
+
sciq,5,median,accuracy,0.581
|
408 |
+
sciq,5,average,multiple,0.563
|
409 |
+
story_cloze_2016,0,Answer Given options,acc,0.49706039551042225
|
410 |
+
story_cloze_2016,0,Choose Story Ending,acc,0.48957776590058794
|
411 |
+
story_cloze_2016,0,Novel Correct Ending,acc,0.4879743452699091
|
412 |
+
story_cloze_2016,0,Story Continuation and Options,acc,0.49438802779262425
|
413 |
+
story_cloze_2016,0,median,accuracy,0.4919828968466061
|
414 |
+
story_cloze_2016,1,Answer Given options,acc,0.4853019775521112
|
415 |
+
story_cloze_2016,1,Choose Story Ending,acc,0.4906467129877071
|
416 |
+
story_cloze_2016,1,Novel Correct Ending,acc,0.48102618920363444
|
417 |
+
story_cloze_2016,1,Story Continuation and Options,acc,0.4917156600748263
|
418 |
+
story_cloze_2016,1,median,accuracy,0.4879743452699091
|
419 |
+
story_cloze_2016,2,Answer Given options,acc,0.47888829502939606
|
420 |
+
story_cloze_2016,2,Choose Story Ending,acc,0.47728487439871725
|
421 |
+
story_cloze_2016,2,Novel Correct Ending,acc,0.4751469802244789
|
422 |
+
story_cloze_2016,2,Story Continuation and Options,acc,0.47995724211651525
|
423 |
+
story_cloze_2016,2,median,accuracy,0.47808658471405663
|
424 |
+
story_cloze_2016,3,Answer Given options,acc,0.4735435595938001
|
425 |
+
story_cloze_2016,3,Choose Story Ending,acc,0.4820951362907536
|
426 |
+
story_cloze_2016,3,Novel Correct Ending,acc,0.4740780331373597
|
427 |
+
story_cloze_2016,3,Story Continuation and Options,acc,0.4901122394441475
|
428 |
+
story_cloze_2016,3,median,accuracy,0.47808658471405663
|
429 |
+
story_cloze_2016,4,Answer Given options,acc,0.46178514163548906
|
430 |
+
story_cloze_2016,4,Choose Story Ending,acc,0.4730090860502405
|
431 |
+
story_cloze_2016,4,Novel Correct Ending,acc,0.4681988241582042
|
432 |
+
story_cloze_2016,4,Story Continuation and Options,acc,0.4879743452699091
|
433 |
+
story_cloze_2016,4,median,accuracy,0.47060395510422237
|
434 |
+
story_cloze_2016,5,Answer Given options,acc,0.46178514163548906
|
435 |
+
story_cloze_2016,5,Choose Story Ending,acc,0.4826296098343132
|
436 |
+
story_cloze_2016,5,Novel Correct Ending,acc,0.4719401389631213
|
437 |
+
story_cloze_2016,5,Story Continuation and Options,acc,0.49438802779262425
|
438 |
+
story_cloze_2016,5,median,accuracy,0.47728487439871725
|
439 |
+
story_cloze_2016,5,average,multiple,0.480669873507928
|
440 |
+
superglue_rte,0,GPT-3 style,acc,0.5090252707581228
|
441 |
+
superglue_rte,0,MNLI crowdsource,acc,0.48014440433212996
|
442 |
+
superglue_rte,0,does it follow that,acc,0.44404332129963897
|
443 |
+
superglue_rte,0,guaranteed true,acc,0.5126353790613718
|
444 |
+
superglue_rte,0,should assume,acc,0.5415162454873647
|
445 |
+
superglue_rte,0,median,accuracy,0.5090252707581228
|
446 |
+
superglue_rte,1,GPT-3 style,acc,0.5090252707581228
|
447 |
+
superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
|
448 |
+
superglue_rte,1,does it follow that,acc,0.49097472924187724
|
449 |
+
superglue_rte,1,guaranteed true,acc,0.49097472924187724
|
450 |
+
superglue_rte,1,should assume,acc,0.48375451263537905
|
451 |
+
superglue_rte,1,median,accuracy,0.49097472924187724
|
452 |
+
superglue_rte,2,GPT-3 style,acc,0.516245487364621
|
453 |
+
superglue_rte,2,MNLI crowdsource,acc,0.5054151624548736
|
454 |
+
superglue_rte,2,does it follow that,acc,0.516245487364621
|
455 |
+
superglue_rte,2,guaranteed true,acc,0.5018050541516246
|
456 |
+
superglue_rte,2,should assume,acc,0.516245487364621
|
457 |
+
superglue_rte,2,median,accuracy,0.516245487364621
|
458 |
+
superglue_rte,3,GPT-3 style,acc,0.5234657039711191
|
459 |
+
superglue_rte,3,MNLI crowdsource,acc,0.5270758122743683
|
460 |
+
superglue_rte,3,does it follow that,acc,0.5379061371841155
|
461 |
+
superglue_rte,3,guaranteed true,acc,0.51985559566787
|
462 |
+
superglue_rte,3,should assume,acc,0.5306859205776173
|
463 |
+
superglue_rte,3,median,accuracy,0.5270758122743683
|
464 |
+
superglue_rte,4,GPT-3 style,acc,0.5126353790613718
|
465 |
+
superglue_rte,4,MNLI crowdsource,acc,0.5379061371841155
|
466 |
+
superglue_rte,4,does it follow that,acc,0.51985559566787
|
467 |
+
superglue_rte,4,guaranteed true,acc,0.5342960288808665
|
468 |
+
superglue_rte,4,should assume,acc,0.5234657039711191
|
469 |
+
superglue_rte,4,median,accuracy,0.5234657039711191
|
470 |
+
superglue_rte,5,GPT-3 style,acc,0.5306859205776173
|
471 |
+
superglue_rte,5,MNLI crowdsource,acc,0.5090252707581228
|
472 |
+
superglue_rte,5,does it follow that,acc,0.5234657039711191
|
473 |
+
superglue_rte,5,guaranteed true,acc,0.5270758122743683
|
474 |
+
superglue_rte,5,should assume,acc,0.5306859205776173
|
475 |
+
superglue_rte,5,median,accuracy,0.5270758122743683
|
476 |
+
superglue_rte,5,average,multiple,0.5156438026474128
|
477 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.049917192299013896
|
478 |
+
web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.00537375341399136
|
479 |
+
web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.0034599979122394713
|
480 |
+
web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.0007679014767691473
|
481 |
+
web_nlg_en,0,very-explicit-description,rouge2_fmeasure,0.0005675109134810492
|
482 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.0034599979122394713
|
483 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05553061893758205
|
484 |
+
web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.1493719195270224
|
485 |
+
web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.06073259012334097
|
486 |
+
web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.12187611759524282
|
487 |
+
web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.058019313464160664
|
488 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.06073259012334097
|
489 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.057331612844470456
|
490 |
+
web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.29741471013327875
|
491 |
+
web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.10509886424101751
|
492 |
+
web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.13915045372935977
|
493 |
+
web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.18536983565498788
|
494 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.13915045372935977
|
495 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05836966723015618
|
496 |
+
web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.31908027874923556
|
497 |
+
web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.12855310203356
|
498 |
+
web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.13862295274121508
|
499 |
+
web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.26767500932744387
|
500 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.13862295274121508
|
501 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0577700863367864
|
502 |
+
web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.32559945531385337
|
503 |
+
web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.13327902883600057
|
504 |
+
web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.13292917476889163
|
505 |
+
web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.24713173271908043
|
506 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.13327902883600057
|
507 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05951196634046783
|
508 |
+
web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.3330531711847648
|
509 |
+
web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.13440771646832117
|
510 |
+
web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.1292376475089326
|
511 |
+
web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.2256483945335245
|
512 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.13440771646832117
|
513 |
+
web_nlg_en,5,average,multiple,0.10160878996841284
|
514 |
+
wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.04599914076874335
|
515 |
+
wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.014876069868397498
|
516 |
+
wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.01758898085911187
|
517 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03601951697280678
|
518 |
+
wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.015984403876231276
|
519 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.01758898085911187
|
520 |
+
wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.040864899057913275
|
521 |
+
wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.02017226312757468
|
522 |
+
wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.025193685294665726
|
523 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04757609861819433
|
524 |
+
wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.020818331143036432
|
525 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.025193685294665726
|
526 |
+
wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.04643912783550571
|
527 |
+
wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.040268213401871214
|
528 |
+
wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.040241799290576724
|
529 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05650249608530642
|
530 |
+
wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.0219486272286028
|
531 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.040268213401871214
|
532 |
+
wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.0414745893981831
|
533 |
+
wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03686577574547315
|
534 |
+
wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.03722078217429424
|
535 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05005886014366939
|
536 |
+
wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.017248853072901214
|
537 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.03722078217429424
|
538 |
+
wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.013892451555974773
|
539 |
+
wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.012358132206090394
|
540 |
+
wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.011579301131776956
|
541 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015594437236270214
|
542 |
+
wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.0040223276062826534
|
543 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.012358132206090394
|
544 |
+
wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.002181790528509318
|
545 |
+
wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0017789109729195612
|
546 |
+
wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0015940878813423497
|
547 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024833328621297794
|
548 |
+
wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.0003338217989499952
|
549 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0017789109729195612
|
550 |
+
wiki_lingua_en,5,average,multiple,0.022401450818158836
|
551 |
+
winogrande,0,Replace,acc,0.49013417521704816
|
552 |
+
winogrande,0,True or False,acc,0.4956590370955012
|
553 |
+
winogrande,0,does underscore refer to,acc,0.4940805051302289
|
554 |
+
winogrande,0,stand for,acc,0.500394632991318
|
555 |
+
winogrande,0,underscore refer to,acc,0.4861878453038674
|
556 |
+
winogrande,0,median,accuracy,0.4940805051302289
|
557 |
+
winogrande,1,Replace,acc,0.5098658247829518
|
558 |
+
winogrande,1,True or False,acc,0.494869771112865
|
559 |
+
winogrande,1,does underscore refer to,acc,0.505130228887135
|
560 |
+
winogrande,1,stand for,acc,0.5209155485398579
|
561 |
+
winogrande,1,underscore refer to,acc,0.5122336227308603
|
562 |
+
winogrande,1,median,accuracy,0.5098658247829518
|
563 |
+
winogrande,2,Replace,acc,0.5177584846093133
|
564 |
+
winogrande,2,True or False,acc,0.4956590370955012
|
565 |
+
winogrande,2,does underscore refer to,acc,0.5303867403314917
|
566 |
+
winogrande,2,stand for,acc,0.5240726124704025
|
567 |
+
winogrande,2,underscore refer to,acc,0.5146014206787688
|
568 |
+
winogrande,2,median,accuracy,0.5177584846093133
|
569 |
+
winogrande,3,Replace,acc,0.5240726124704025
|
570 |
+
winogrande,3,True or False,acc,0.4988161010260458
|
571 |
+
winogrande,3,does underscore refer to,acc,0.5272296764009471
|
572 |
+
winogrande,3,stand for,acc,0.510655090765588
|
573 |
+
winogrande,3,underscore refer to,acc,0.5248618784530387
|
574 |
+
winogrande,3,median,accuracy,0.5240726124704025
|
575 |
+
winogrande,4,Replace,acc,0.5177584846093133
|
576 |
+
winogrande,4,True or False,acc,0.5027624309392266
|
577 |
+
winogrande,4,does underscore refer to,acc,0.5288082083662194
|
578 |
+
winogrande,4,stand for,acc,0.5067087608524072
|
579 |
+
winogrande,4,underscore refer to,acc,0.5232833464877664
|
580 |
+
winogrande,4,median,accuracy,0.5177584846093133
|
581 |
+
winogrande,5,Replace,acc,0.5185477505919495
|
582 |
+
winogrande,5,True or False,acc,0.4940805051302289
|
583 |
+
winogrande,5,does underscore refer to,acc,0.526440410418311
|
584 |
+
winogrande,5,stand for,acc,0.4972375690607735
|
585 |
+
winogrande,5,underscore refer to,acc,0.5224940805051302
|
586 |
+
winogrande,5,median,accuracy,0.5185477505919495
|
587 |
+
winogrande,5,average,multiple,0.5136806103656932
|
4b284b28bc4/eval/merged.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b284b42bc4/eval/merged.csv
ADDED
@@ -0,0 +1,587 @@
1 |
+
dataset,fewshots,prompt,metric,value
|
2 |
+
anli_r1,0,GPT-3 style,acc,0.323
|
3 |
+
anli_r1,0,MNLI crowdsource,acc,0.334
|
4 |
+
anli_r1,0,can we infer,acc,0.342
|
5 |
+
anli_r1,0,guaranteed/possible/impossible,acc,0.334
|
6 |
+
anli_r1,0,justified in saying,acc,0.342
|
7 |
+
anli_r1,0,median,accuracy,0.334
|
8 |
+
anli_r1,1,GPT-3 style,acc,0.324
|
9 |
+
anli_r1,1,MNLI crowdsource,acc,0.333
|
10 |
+
anli_r1,1,can we infer,acc,0.333
|
11 |
+
anli_r1,1,guaranteed/possible/impossible,acc,0.331
|
12 |
+
anli_r1,1,justified in saying,acc,0.333
|
13 |
+
anli_r1,1,median,accuracy,0.333
|
14 |
+
anli_r1,2,GPT-3 style,acc,0.346
|
15 |
+
anli_r1,2,MNLI crowdsource,acc,0.359
|
16 |
+
anli_r1,2,can we infer,acc,0.341
|
17 |
+
anli_r1,2,guaranteed/possible/impossible,acc,0.331
|
18 |
+
anli_r1,2,justified in saying,acc,0.337
|
19 |
+
anli_r1,2,median,accuracy,0.341
|
20 |
+
anli_r1,3,GPT-3 style,acc,0.348
|
21 |
+
anli_r1,3,MNLI crowdsource,acc,0.356
|
22 |
+
anli_r1,3,can we infer,acc,0.366
|
23 |
+
anli_r1,3,guaranteed/possible/impossible,acc,0.334
|
24 |
+
anli_r1,3,justified in saying,acc,0.356
|
25 |
+
anli_r1,3,median,accuracy,0.356
|
26 |
+
anli_r1,4,GPT-3 style,acc,0.325
|
27 |
+
anli_r1,4,MNLI crowdsource,acc,0.345
|
28 |
+
anli_r1,4,can we infer,acc,0.335
|
29 |
+
anli_r1,4,guaranteed/possible/impossible,acc,0.341
|
30 |
+
anli_r1,4,justified in saying,acc,0.343
|
31 |
+
anli_r1,4,median,accuracy,0.341
|
32 |
+
anli_r1,5,GPT-3 style,acc,0.314
|
33 |
+
anli_r1,5,MNLI crowdsource,acc,0.353
|
34 |
+
anli_r1,5,can we infer,acc,0.328
|
35 |
+
anli_r1,5,guaranteed/possible/impossible,acc,0.327
|
36 |
+
anli_r1,5,justified in saying,acc,0.326
|
37 |
+
anli_r1,5,median,accuracy,0.327
|
38 |
+
anli_r1,5,average,multiple,0.33866666666666667
|
39 |
+
anli_r2,0,GPT-3 style,acc,0.327
|
40 |
+
anli_r2,0,MNLI crowdsource,acc,0.334
|
41 |
+
anli_r2,0,can we infer,acc,0.348
|
42 |
+
anli_r2,0,guaranteed/possible/impossible,acc,0.336
|
43 |
+
anli_r2,0,justified in saying,acc,0.34
|
44 |
+
anli_r2,0,median,accuracy,0.336
|
45 |
+
anli_r2,1,GPT-3 style,acc,0.309
|
46 |
+
anli_r2,1,MNLI crowdsource,acc,0.315
|
47 |
+
anli_r2,1,can we infer,acc,0.315
|
48 |
+
anli_r2,1,guaranteed/possible/impossible,acc,0.308
|
49 |
+
anli_r2,1,justified in saying,acc,0.315
|
50 |
+
anli_r2,1,median,accuracy,0.315
|
51 |
+
anli_r2,2,GPT-3 style,acc,0.317
|
52 |
+
anli_r2,2,MNLI crowdsource,acc,0.312
|
53 |
+
anli_r2,2,can we infer,acc,0.316
|
54 |
+
anli_r2,2,guaranteed/possible/impossible,acc,0.324
|
55 |
+
anli_r2,2,justified in saying,acc,0.322
|
56 |
+
anli_r2,2,median,accuracy,0.317
|
57 |
+
anli_r2,3,GPT-3 style,acc,0.333
|
58 |
+
anli_r2,3,MNLI crowdsource,acc,0.305
|
59 |
+
anli_r2,3,can we infer,acc,0.32
|
60 |
+
anli_r2,3,guaranteed/possible/impossible,acc,0.33
|
61 |
+
anli_r2,3,justified in saying,acc,0.315
|
62 |
+
anli_r2,3,median,accuracy,0.32
|
63 |
+
anli_r2,4,GPT-3 style,acc,0.323
|
64 |
+
anli_r2,4,MNLI crowdsource,acc,0.306
|
65 |
+
anli_r2,4,can we infer,acc,0.308
|
66 |
+
anli_r2,4,guaranteed/possible/impossible,acc,0.311
|
67 |
+
anli_r2,4,justified in saying,acc,0.306
|
68 |
+
anli_r2,4,median,accuracy,0.308
|
69 |
+
anli_r2,5,GPT-3 style,acc,0.327
|
70 |
+
anli_r2,5,MNLI crowdsource,acc,0.315
|
71 |
+
anli_r2,5,can we infer,acc,0.326
|
72 |
+
anli_r2,5,guaranteed/possible/impossible,acc,0.319
|
73 |
+
anli_r2,5,justified in saying,acc,0.319
|
74 |
+
anli_r2,5,median,accuracy,0.319
|
75 |
+
anli_r2,5,average,multiple,0.31916666666666665
|
76 |
+
anli_r3,0,GPT-3 style,acc,0.35083333333333333
|
77 |
+
anli_r3,0,MNLI crowdsource,acc,0.33416666666666667
|
78 |
+
anli_r3,0,can we infer,acc,0.3325
|
79 |
+
anli_r3,0,guaranteed/possible/impossible,acc,0.3275
|
80 |
+
anli_r3,0,justified in saying,acc,0.33916666666666667
|
81 |
+
anli_r3,0,median,accuracy,0.33416666666666667
|
82 |
+
anli_r3,1,GPT-3 style,acc,0.3441666666666667
|
83 |
+
anli_r3,1,MNLI crowdsource,acc,0.33666666666666667
|
84 |
+
anli_r3,1,can we infer,acc,0.33666666666666667
|
85 |
+
anli_r3,1,guaranteed/possible/impossible,acc,0.3283333333333333
|
86 |
+
anli_r3,1,justified in saying,acc,0.33666666666666667
|
87 |
+
anli_r3,1,median,accuracy,0.33666666666666667
|
88 |
+
anli_r3,2,GPT-3 style,acc,0.3275
|
89 |
+
anli_r3,2,MNLI crowdsource,acc,0.31916666666666665
|
90 |
+
anli_r3,2,can we infer,acc,0.32
|
91 |
+
anli_r3,2,guaranteed/possible/impossible,acc,0.3125
|
92 |
+
anli_r3,2,justified in saying,acc,0.3275
|
93 |
+
anli_r3,2,median,accuracy,0.32
|
94 |
+
anli_r3,3,GPT-3 style,acc,0.33666666666666667
|
95 |
+
anli_r3,3,MNLI crowdsource,acc,0.32916666666666666
|
96 |
+
anli_r3,3,can we infer,acc,0.335
|
97 |
+
anli_r3,3,guaranteed/possible/impossible,acc,0.3258333333333333
|
98 |
+
anli_r3,3,justified in saying,acc,0.3383333333333333
|
99 |
+
anli_r3,3,median,accuracy,0.335
|
100 |
+
anli_r3,4,GPT-3 style,acc,0.30666666666666664
|
101 |
+
anli_r3,4,MNLI crowdsource,acc,0.3275
|
102 |
+
anli_r3,4,can we infer,acc,0.3233333333333333
|
103 |
+
anli_r3,4,guaranteed/possible/impossible,acc,0.31583333333333335
|
104 |
+
anli_r3,4,justified in saying,acc,0.32166666666666666
|
105 |
+
anli_r3,4,median,accuracy,0.32166666666666666
|
106 |
+
anli_r3,5,GPT-3 style,acc,0.31166666666666665
|
107 |
+
anli_r3,5,MNLI crowdsource,acc,0.30833333333333335
|
108 |
+
anli_r3,5,can we infer,acc,0.32666666666666666
|
109 |
+
anli_r3,5,guaranteed/possible/impossible,acc,0.31416666666666665
|
110 |
+
anli_r3,5,justified in saying,acc,0.3233333333333333
|
111 |
+
anli_r3,5,median,accuracy,0.31416666666666665
|
112 |
+
anli_r3,5,average,multiple,0.3269444444444444
|
113 |
+
arc_easy,0,heres_a_problem,acc,0.24494949494949494
|
114 |
+
arc_easy,0,i_am_hesitating,acc,0.25170648464163825
|
115 |
+
arc_easy,0,multiple_choice,acc,0.24488054607508533
|
116 |
+
arc_easy,0,pick_the_most_correct_option,acc,0.23947811447811448
|
117 |
+
arc_easy,0,qa_options,acc,0.2619453924914676
|
118 |
+
arc_easy,0,median,accuracy,0.24494949494949494
|
119 |
+
arc_easy,1,heres_a_problem,acc,0.2380546075085324
|
120 |
+
arc_easy,1,i_am_hesitating,acc,0.3560606060606061
|
121 |
+
arc_easy,1,multiple_choice,acc,0.24914675767918087
|
122 |
+
arc_easy,1,pick_the_most_correct_option,acc,0.23526936026936027
|
123 |
+
arc_easy,1,qa_options,acc,0.26791808873720135
|
124 |
+
arc_easy,1,median,accuracy,0.24914675767918087
|
125 |
+
arc_easy,2,heres_a_problem,acc,0.242003367003367
|
126 |
+
arc_easy,2,i_am_hesitating,acc,0.3480639730639731
|
127 |
+
arc_easy,2,multiple_choice,acc,0.35353535353535354
|
128 |
+
arc_easy,2,pick_the_most_correct_option,acc,0.2431740614334471
|
129 |
+
arc_easy,2,qa_options,acc,0.34553872053872053
|
130 |
+
arc_easy,2,median,accuracy,0.34553872053872053
|
131 |
+
arc_easy,3,heres_a_problem,acc,0.2478956228956229
|
132 |
+
arc_easy,3,i_am_hesitating,acc,0.26535836177474403
|
133 |
+
arc_easy,3,multiple_choice,acc,0.34553872053872053
|
134 |
+
arc_easy,3,pick_the_most_correct_option,acc,0.2474747474747475
|
135 |
+
arc_easy,3,qa_options,acc,0.257679180887372
|
136 |
+
arc_easy,3,median,accuracy,0.257679180887372
|
137 |
+
arc_easy,4,heres_a_problem,acc,0.25341296928327645
|
138 |
+
arc_easy,4,i_am_hesitating,acc,0.2593856655290102
|
139 |
+
arc_easy,4,multiple_choice,acc,0.2525597269624573
|
140 |
+
arc_easy,4,pick_the_most_correct_option,acc,0.2551194539249147
|
141 |
+
arc_easy,4,qa_options,acc,0.3341750841750842
|
142 |
+
arc_easy,4,median,accuracy,0.2551194539249147
|
143 |
+
arc_easy,5,heres_a_problem,acc,0.2354948805460751
|
144 |
+
arc_easy,5,i_am_hesitating,acc,0.32365319865319864
|
145 |
+
arc_easy,5,multiple_choice,acc,0.2508532423208191
|
146 |
+
arc_easy,5,pick_the_most_correct_option,acc,0.25252525252525254
|
147 |
+
arc_easy,5,qa_options,acc,0.3261784511784512
|
148 |
+
arc_easy,5,median,accuracy,0.25252525252525254
|
149 |
+
arc_easy,5,average,multiple,0.26749314341748925
|
150 |
+
boolq,0,GPT-3 Style,acc,0.538
|
151 |
+
boolq,0,after_reading,acc,0.6233333333333333
|
152 |
+
boolq,0,exercise,acc,0.623
|
153 |
+
boolq,0,valid_binary,acc,0.5896666666666667
|
154 |
+
boolq,0,yes_no_question,acc,0.5293333333333333
|
155 |
+
boolq,0,median,accuracy,0.5896666666666667
|
156 |
+
boolq,1,GPT-3 Style,acc,0.5356666666666666
|
157 |
+
boolq,1,after_reading,acc,0.5406666666666666
|
158 |
+
boolq,1,exercise,acc,0.5566666666666666
|
159 |
+
boolq,1,valid_binary,acc,0.5423333333333333
|
160 |
+
boolq,1,yes_no_question,acc,0.5406666666666666
|
161 |
+
boolq,1,median,accuracy,0.5406666666666666
|
162 |
+
boolq,2,GPT-3 Style,acc,0.5443333333333333
|
163 |
+
boolq,2,after_reading,acc,0.5396666666666666
|
164 |
+
boolq,2,exercise,acc,0.5536666666666666
|
165 |
+
boolq,2,valid_binary,acc,0.5706666666666667
|
166 |
+
boolq,2,yes_no_question,acc,0.48233333333333334
|
167 |
+
boolq,2,median,accuracy,0.5443333333333333
|
168 |
+
boolq,3,GPT-3 Style,acc,0.5566666666666666
|
169 |
+
boolq,3,after_reading,acc,0.539
|
170 |
+
boolq,3,exercise,acc,0.5583333333333333
|
171 |
+
boolq,3,valid_binary,acc,0.5633333333333334
|
172 |
+
boolq,3,yes_no_question,acc,0.4676666666666667
|
173 |
+
boolq,3,median,accuracy,0.5566666666666666
|
174 |
+
boolq,4,GPT-3 Style,acc,0.5656666666666667
|
175 |
+
boolq,4,after_reading,acc,0.527
|
176 |
+
boolq,4,exercise,acc,0.57
|
177 |
+
boolq,4,valid_binary,acc,0.5543333333333333
|
178 |
+
boolq,4,yes_no_question,acc,0.481
|
179 |
+
boolq,4,median,accuracy,0.5543333333333333
|
180 |
+
boolq,5,GPT-3 Style,acc,0.5716666666666667
|
181 |
+
boolq,5,after_reading,acc,0.5133333333333333
|
182 |
+
boolq,5,exercise,acc,0.567
|
183 |
+
boolq,5,valid_binary,acc,0.561
|
184 |
+
boolq,5,yes_no_question,acc,0.47733333333333333
|
185 |
+
boolq,5,median,accuracy,0.561
|
186 |
+
boolq,5,average,multiple,0.5577777777777778
|
187 |
+
cb,0,GPT-3 style,acc,0.35714285714285715
|
188 |
+
cb,0,MNLI crowdsource,acc,0.4107142857142857
|
189 |
+
cb,0,can we infer,acc,0.4642857142857143
|
190 |
+
cb,0,guaranteed/possible/impossible,acc,0.14285714285714285
|
191 |
+
cb,0,justified in saying,acc,0.35714285714285715
|
192 |
+
cb,0,median,accuracy,0.35714285714285715
|
193 |
+
cb,1,GPT-3 style,acc,0.39285714285714285
|
194 |
+
cb,1,MNLI crowdsource,acc,0.39285714285714285
|
195 |
+
cb,1,can we infer,acc,0.39285714285714285
|
196 |
+
cb,1,guaranteed/possible/impossible,acc,0.375
|
197 |
+
cb,1,justified in saying,acc,0.39285714285714285
|
198 |
+
cb,1,median,accuracy,0.39285714285714285
|
199 |
+
cb,2,GPT-3 style,acc,0.375
|
200 |
+
cb,2,MNLI crowdsource,acc,0.4642857142857143
|
201 |
+
cb,2,can we infer,acc,0.39285714285714285
|
202 |
+
cb,2,guaranteed/possible/impossible,acc,0.375
|
203 |
+
cb,2,justified in saying,acc,0.4107142857142857
|
204 |
+
cb,2,median,accuracy,0.39285714285714285
|
205 |
+
cb,3,GPT-3 style,acc,0.35714285714285715
|
206 |
+
cb,3,MNLI crowdsource,acc,0.5357142857142857
|
207 |
+
cb,3,can we infer,acc,0.44642857142857145
|
208 |
+
cb,3,guaranteed/possible/impossible,acc,0.35714285714285715
|
209 |
+
cb,3,justified in saying,acc,0.44642857142857145
|
210 |
+
cb,3,median,accuracy,0.44642857142857145
|
211 |
+
cb,4,GPT-3 style,acc,0.3392857142857143
|
212 |
+
cb,4,MNLI crowdsource,acc,0.4642857142857143
|
213 |
+
cb,4,can we infer,acc,0.5357142857142857
|
214 |
+
cb,4,guaranteed/possible/impossible,acc,0.42857142857142855
|
215 |
+
cb,4,justified in saying,acc,0.44642857142857145
|
216 |
+
cb,4,median,accuracy,0.44642857142857145
|
217 |
+
cb,5,GPT-3 style,acc,0.30357142857142855
|
218 |
+
cb,5,MNLI crowdsource,acc,0.5
|
219 |
+
cb,5,can we infer,acc,0.5
|
220 |
+
cb,5,guaranteed/possible/impossible,acc,0.35714285714285715
|
221 |
+
cb,5,justified in saying,acc,0.42857142857142855
|
222 |
+
cb,5,median,accuracy,0.42857142857142855
|
223 |
+
cb,5,average,multiple,0.4107142857142857
|
224 |
+
copa,0,best_option,acc,0.54
|
225 |
+
copa,0,cause_effect,acc,0.6
|
226 |
+
copa,0,choose,acc,0.61
|
227 |
+
copa,0,i_am_hesitating,acc,0.62
|
228 |
+
copa,0,plausible_alternatives,acc,0.61
|
229 |
+
copa,0,median,accuracy,0.61
|
230 |
+
copa,1,best_option,acc,0.57
|
231 |
+
copa,1,cause_effect,acc,0.47
|
232 |
+
copa,1,choose,acc,0.47
|
233 |
+
copa,1,i_am_hesitating,acc,0.49
|
234 |
+
copa,1,plausible_alternatives,acc,0.42
|
235 |
+
copa,1,median,accuracy,0.47
|
236 |
+
copa,2,best_option,acc,0.57
|
237 |
+
copa,2,cause_effect,acc,0.44
|
238 |
+
copa,2,choose,acc,0.48
|
239 |
+
copa,2,i_am_hesitating,acc,0.45
|
240 |
+
copa,2,plausible_alternatives,acc,0.43
|
241 |
+
copa,2,median,accuracy,0.45
|
242 |
+
copa,3,best_option,acc,0.57
|
243 |
+
copa,3,cause_effect,acc,0.48
|
244 |
+
copa,3,choose,acc,0.49
|
245 |
+
copa,3,i_am_hesitating,acc,0.44
|
246 |
+
copa,3,plausible_alternatives,acc,0.43
|
247 |
+
copa,3,median,accuracy,0.48
|
248 |
+
copa,4,best_option,acc,0.59
|
249 |
+
copa,4,cause_effect,acc,0.48
|
250 |
+
copa,4,choose,acc,0.47
|
251 |
+
copa,4,i_am_hesitating,acc,0.48
|
252 |
+
copa,4,plausible_alternatives,acc,0.45
|
253 |
+
copa,4,median,accuracy,0.48
|
254 |
+
copa,5,best_option,acc,0.55
|
255 |
+
copa,5,cause_effect,acc,0.45
|
256 |
+
copa,5,choose,acc,0.45
|
257 |
+
copa,5,i_am_hesitating,acc,0.44
|
258 |
+
copa,5,plausible_alternatives,acc,0.45
|
259 |
+
copa,5,median,accuracy,0.45
|
260 |
+
copa,5,average,multiple,0.49
|
261 |
+
e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.0768086035487303
|
262 |
+
e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.02963754911980315
|
263 |
+
e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.0
|
264 |
+
e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0063826724183375155
|
265 |
+
e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.16205539672246214
|
266 |
+
e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.02963754911980315
|
267 |
+
e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1695476123074274
|
268 |
+
e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16266540713305758
|
269 |
+
e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.06654255399675307
|
270 |
+
e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2056915755809246
|
271 |
+
e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.20190477038644114
|
272 |
+
e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1695476123074274
|
273 |
+
e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.18139573177296225
|
274 |
+
e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.174575698484694
|
275 |
+
e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.10730240032895362
|
276 |
+
e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22591032128288588
|
277 |
+
e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.20141804290762905
|
278 |
+
e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18139573177296225
|
279 |
+
e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.18499822465619842
|
280 |
+
e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.17547831360286914
|
281 |
+
e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.12907057263921567
|
282 |
+
e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23547797340215765
|
283 |
+
e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.19894653788829594
|
284 |
+
e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18499822465619842
|
285 |
+
e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.18449634822804548
|
286 |
+
e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.1739020471143616
|
287 |
+
e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.14413942196532828
|
288 |
+
e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23765394178309218
|
289 |
+
e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19618559845779804
|
290 |
+
e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18449634822804548
|
291 |
+
e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.180205137590262
|
292 |
+
e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.17145057785135936
|
293 |
+
e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.15257686050529207
|
294 |
+
e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2366049201616526
|
295 |
+
e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.19065995281551984
|
296 |
+
e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.180205137590262
|
297 |
+
e2e_nlg_cleaned,5,average,multiple,0.15504676727911645
|
298 |
+
gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.021410321262745457
|
299 |
+
gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.05123793702073981
|
300 |
+
gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04790575968435739
|
301 |
+
gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.041608603555378855
|
302 |
+
gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.04915332059409096
|
303 |
+
gem_xsum,0,median,rouge2_fmeasure,0.04790575968435739
|
304 |
+
gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0187210253001416
|
305 |
+
gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.050711870640867504
|
306 |
+
gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04413377405232099
|
307 |
+
gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04729852367510724
|
308 |
+
gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.038642466121148654
|
309 |
+
gem_xsum,1,median,rouge2_fmeasure,0.04413377405232099
|
310 |
+
gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03174503623834507
|
311 |
+
gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05192590462289213
|
312 |
+
gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.046170973346933354
|
313 |
+
gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.04973150539674991
|
314 |
+
gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.04211259064384283
|
315 |
+
gem_xsum,2,median,rouge2_fmeasure,0.046170973346933354
|
316 |
+
gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03592964212545872
|
317 |
+
gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.0504745368841509
|
318 |
+
gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04715930396420784
|
319 |
+
gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.04868798327580798
|
320 |
+
gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.0405702032936913
|
321 |
+
gem_xsum,3,median,rouge2_fmeasure,0.04715930396420784
|
322 |
+
gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.009837365340636202
|
323 |
+
gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.013418856497160158
|
324 |
+
gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012335864733508397
|
325 |
+
gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.012561323055777309
|
326 |
+
gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.010936982876670076
|
327 |
+
gem_xsum,4,median,rouge2_fmeasure,0.012335864733508397
|
328 |
+
gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
|
329 |
+
gem_xsum,5,DOC_tldr,rouge2_fmeasure,9.617082045566528e-05
|
330 |
+
gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0004337191943913522
|
331 |
+
gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.00025865120204742847
|
332 |
+
gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.00011435105774728416
|
333 |
+
gem_xsum,5,median,rouge2_fmeasure,0.00011435105774728416
|
334 |
+
gem_xsum,5,average,multiple,0.03297000447317921
|
335 |
+
piqa,0,Correct the solution,rouge2_fmeasure,0.20120770859481335
|
336 |
+
piqa,0,choose the most appropriate solution,acc,0.48639825897714906
|
337 |
+
piqa,0,no prompt needed,rouge2_fmeasure,0.00581609356366873
|
338 |
+
piqa,0,pick_correct_choice_index,acc,0.49510337323177367
|
339 |
+
piqa,0,what_is_the_correct_ending,acc,0.5663764961915125
|
340 |
+
piqa,0,median,accuracy,0.49510337323177367
|
341 |
+
piqa,1,Correct the solution,rouge2_fmeasure,0.33159324549421026
|
342 |
+
piqa,1,choose the most appropriate solution,acc,0.5032644178454843
|
343 |
+
piqa,1,no prompt needed,rouge2_fmeasure,0.005210530351647924
|
344 |
+
piqa,1,pick_correct_choice_index,acc,0.49347116430903154
|
345 |
+
piqa,1,what_is_the_correct_ending,acc,0.573993471164309
|
346 |
+
piqa,1,median,accuracy,0.5032644178454843
|
347 |
+
piqa,2,Correct the solution,rouge2_fmeasure,0.38914197156300356
|
348 |
+
piqa,2,choose the most appropriate solution,acc,0.5103373231773667
|
349 |
+
piqa,2,no prompt needed,rouge2_fmeasure,0.004949579293108938
|
350 |
+
piqa,2,pick_correct_choice_index,acc,0.4885745375408052
|
351 |
+
piqa,2,what_is_the_correct_ending,acc,0.5696409140369967
|
352 |
+
piqa,2,median,accuracy,0.5103373231773667
|
353 |
+
piqa,3,Correct the solution,rouge2_fmeasure,0.39761438363429064
|
354 |
+
piqa,3,choose the most appropriate solution,acc,0.5048966267682263
|
355 |
+
piqa,3,no prompt needed,rouge2_fmeasure,0.004937686316660083
|
356 |
+
piqa,3,pick_correct_choice_index,acc,0.5065288356909684
|
357 |
+
piqa,3,what_is_the_correct_ending,acc,0.5554951033732318
|
358 |
+
piqa,3,median,accuracy,0.5065288356909684
|
359 |
+
piqa,4,Correct the solution,rouge2_fmeasure,0.36764860730658894
|
360 |
+
piqa,4,choose the most appropriate solution,acc,0.5032644178454843
|
361 |
+
piqa,4,no prompt needed,rouge2_fmeasure,0.004548001185708453
|
362 |
+
piqa,4,pick_correct_choice_index,acc,0.5081610446137106
|
363 |
+
piqa,4,what_is_the_correct_ending,acc,0.5544069640914037
|
364 |
+
piqa,4,median,accuracy,0.5081610446137106
|
365 |
+
piqa,5,Correct the solution,rouge2_fmeasure,0.3454708512028626
|
366 |
+
piqa,5,choose the most appropriate solution,acc,0.5038084874863983
|
367 |
+
piqa,5,no prompt needed,rouge2_fmeasure,0.004669858309181997
|
368 |
+
piqa,5,pick_correct_choice_index,acc,0.49020674646354734
|
369 |
+
piqa,5,what_is_the_correct_ending,acc,0.5554951033732318
|
370 |
+
piqa,5,median,accuracy,0.5038084874863983
|
371 |
+
piqa,5,average,multiple,0.5045339136742837
|
372 |
+
sciq,0,Direct Question,acc,0.867
|
373 |
+
sciq,0,Direct Question (Closed Book),acc,0.639
|
374 |
+
sciq,0,Multiple Choice,acc,0.601
|
375 |
+
sciq,0,Multiple Choice (Closed Book),acc,0.5
|
376 |
+
sciq,0,Multiple Choice Question First,acc,0.625
|
377 |
+
sciq,0,median,accuracy,0.625
|
378 |
+
sciq,1,Direct Question,acc,0.892
|
379 |
+
sciq,1,Direct Question (Closed Book),acc,0.679
|
380 |
+
sciq,1,Multiple Choice,acc,0.507
|
381 |
+
sciq,1,Multiple Choice (Closed Book),acc,0.506
|
382 |
+
sciq,1,Multiple Choice Question First,acc,0.42
|
383 |
+
sciq,1,median,accuracy,0.507
|
384 |
+
sciq,2,Direct Question,acc,0.9
|
385 |
+
sciq,2,Direct Question (Closed Book),acc,0.702
|
386 |
+
sciq,2,Multiple Choice,acc,0.559
|
387 |
+
sciq,2,Multiple Choice (Closed Book),acc,0.539
|
388 |
+
sciq,2,Multiple Choice Question First,acc,0.477
|
389 |
+
sciq,2,median,accuracy,0.559
|
390 |
+
sciq,3,Direct Question,acc,0.909
|
391 |
+
sciq,3,Direct Question (Closed Book),acc,0.717
|
392 |
+
sciq,3,Multiple Choice,acc,0.607
|
393 |
+
sciq,3,Multiple Choice (Closed Book),acc,0.57
|
394 |
+
sciq,3,Multiple Choice Question First,acc,0.546
|
395 |
+
sciq,3,median,accuracy,0.607
|
396 |
+
sciq,4,Direct Question,acc,0.912
|
397 |
+
sciq,4,Direct Question (Closed Book),acc,0.716
|
398 |
+
sciq,4,Multiple Choice,acc,0.642
|
399 |
+
sciq,4,Multiple Choice (Closed Book),acc,0.565
|
400 |
+
sciq,4,Multiple Choice Question First,acc,0.574
|
401 |
+
sciq,4,median,accuracy,0.642
|
402 |
+
sciq,5,Direct Question,acc,0.918
|
403 |
+
sciq,5,Direct Question (Closed Book),acc,0.716
|
404 |
+
sciq,5,Multiple Choice,acc,0.643
|
405 |
+
sciq,5,Multiple Choice (Closed Book),acc,0.577
|
406 |
+
sciq,5,Multiple Choice Question First,acc,0.622
|
407 |
+
sciq,5,median,accuracy,0.643
|
408 |
+
sciq,5,average,multiple,0.5971666666666666
|
409 |
+
story_cloze_2016,0,Answer Given options,acc,0.4730090860502405
|
410 |
+
story_cloze_2016,0,Choose Story Ending,acc,0.4820951362907536
|
411 |
+
story_cloze_2016,0,Novel Correct Ending,acc,0.4820951362907536
|
412 |
+
story_cloze_2016,0,Story Continuation and Options,acc,0.46125066809192944
|
413 |
+
story_cloze_2016,0,median,accuracy,0.47755211117049706
|
414 |
+
story_cloze_2016,1,Answer Given options,acc,0.47140566541956175
|
415 |
+
story_cloze_2016,1,Choose Story Ending,acc,0.48583645109567075
|
416 |
+
story_cloze_2016,1,Novel Correct Ending,acc,0.4820951362907536
|
417 |
+
story_cloze_2016,1,Story Continuation and Options,acc,0.48850881881346875
|
418 |
+
story_cloze_2016,1,median,accuracy,0.4839657936932122
|
419 |
+
story_cloze_2016,2,Answer Given options,acc,0.47728487439871725
|
420 |
+
story_cloze_2016,2,Choose Story Ending,acc,0.48583645109567075
|
421 |
+
story_cloze_2016,2,Novel Correct Ending,acc,0.4853019775521112
|
422 |
+
story_cloze_2016,2,Story Continuation and Options,acc,0.47728487439871725
|
423 |
+
story_cloze_2016,2,median,accuracy,0.48129342597541425
|
424 |
+
story_cloze_2016,3,Answer Given options,acc,0.47247461250668094
|
425 |
+
story_cloze_2016,3,Choose Story Ending,acc,0.4751469802244789
|
426 |
+
story_cloze_2016,3,Novel Correct Ending,acc,0.4794227685729556
|
427 |
+
story_cloze_2016,3,Story Continuation and Options,acc,0.4681988241582042
|
428 |
+
story_cloze_2016,3,median,accuracy,0.4738107963655799
|
429 |
+
story_cloze_2016,4,Answer Given options,acc,0.4730090860502405
|
430 |
+
story_cloze_2016,4,Choose Story Ending,acc,0.47247461250668094
|
431 |
+
story_cloze_2016,4,Novel Correct Ending,acc,0.4831640833778728
|
432 |
+
story_cloze_2016,4,Story Continuation and Options,acc,0.4692677712453234
|
433 |
+
story_cloze_2016,4,median,accuracy,0.47274184927846075
|
434 |
+
story_cloze_2016,5,Answer Given options,acc,0.47033671833244256
|
435 |
+
story_cloze_2016,5,Choose Story Ending,acc,0.4665954035275254
|
436 |
+
story_cloze_2016,5,Novel Correct Ending,acc,0.47888829502939606
|
437 |
+
story_cloze_2016,5,Story Continuation and Options,acc,0.4740780331373597
|
438 |
+
story_cloze_2016,5,median,accuracy,0.4722073757349011
|
439 |
+
story_cloze_2016,5,average,multiple,0.4769285587030109
|
440 |
+
superglue_rte,0,GPT-3 style,acc,0.4404332129963899
|
441 |
+
superglue_rte,0,MNLI crowdsource,acc,0.5523465703971119
|
442 |
+
superglue_rte,0,does it follow that,acc,0.5451263537906137
|
443 |
+
superglue_rte,0,guaranteed true,acc,0.48014440433212996
|
444 |
+
superglue_rte,0,should assume,acc,0.51985559566787
|
445 |
+
superglue_rte,0,median,accuracy,0.51985559566787
|
446 |
+
superglue_rte,1,GPT-3 style,acc,0.5018050541516246
|
447 |
+
superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
|
448 |
+
superglue_rte,1,does it follow that,acc,0.48736462093862815
|
449 |
+
superglue_rte,1,guaranteed true,acc,0.49097472924187724
|
450 |
+
superglue_rte,1,should assume,acc,0.49097472924187724
|
451 |
+
superglue_rte,1,median,accuracy,0.49097472924187724
|
452 |
+
superglue_rte,2,GPT-3 style,acc,0.5234657039711191
|
453 |
+
superglue_rte,2,MNLI crowdsource,acc,0.5054151624548736
|
454 |
+
superglue_rte,2,does it follow that,acc,0.51985559566787
|
455 |
+
superglue_rte,2,guaranteed true,acc,0.48375451263537905
|
456 |
+
superglue_rte,2,should assume,acc,0.4981949458483754
|
457 |
+
superglue_rte,2,median,accuracy,0.5054151624548736
|
458 |
+
superglue_rte,3,GPT-3 style,acc,0.555956678700361
|
459 |
+
superglue_rte,3,MNLI crowdsource,acc,0.5018050541516246
|
460 |
+
superglue_rte,3,does it follow that,acc,0.5306859205776173
|
461 |
+
superglue_rte,3,guaranteed true,acc,0.5270758122743683
|
462 |
+
superglue_rte,3,should assume,acc,0.516245487364621
|
463 |
+
superglue_rte,3,median,accuracy,0.5270758122743683
|
464 |
+
superglue_rte,4,GPT-3 style,acc,0.5631768953068592
|
465 |
+
superglue_rte,4,MNLI crowdsource,acc,0.47653429602888087
|
466 |
+
superglue_rte,4,does it follow that,acc,0.516245487364621
|
467 |
+
superglue_rte,4,guaranteed true,acc,0.49097472924187724
|
468 |
+
superglue_rte,4,should assume,acc,0.47653429602888087
|
469 |
+
superglue_rte,4,median,accuracy,0.49097472924187724
|
470 |
+
superglue_rte,5,GPT-3 style,acc,0.5631768953068592
|
471 |
+
superglue_rte,5,MNLI crowdsource,acc,0.4584837545126354
|
472 |
+
superglue_rte,5,does it follow that,acc,0.5415162454873647
|
473 |
+
superglue_rte,5,guaranteed true,acc,0.4729241877256318
|
474 |
+
superglue_rte,5,should assume,acc,0.4584837545126354
|
475 |
+
superglue_rte,5,median,accuracy,0.4729241877256318
|
476 |
+
superglue_rte,5,average,multiple,0.5012033694344163
|
477 |
+
web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05344453588119793
|
478 |
+
web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.011184314741657642
|
479 |
+
web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.008207247449935154
|
480 |
+
web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.020385877678818807
|
481 |
+
web_nlg_en,0,very-explicit-description,rouge2_fmeasure,3.843511432044316e-06
|
482 |
+
web_nlg_en,0,median,rouge2_fmeasure,0.011184314741657642
|
483 |
+
web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0604895960614538
|
484 |
+
web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.15888598086244612
|
485 |
+
web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.06685938813989681
|
486 |
+
web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.10697454965926212
|
487 |
+
web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.08423671425300502
|
488 |
+
web_nlg_en,1,median,rouge2_fmeasure,0.08423671425300502
|
489 |
+
web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.06086364336249341
|
490 |
+
web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.2342525950120652
|
491 |
+
web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.09546217670504362
|
492 |
+
web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.10198081937332562
|
493 |
+
web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.10461100159118264
|
494 |
+
web_nlg_en,2,median,rouge2_fmeasure,0.10198081937332562
|
495 |
+
web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06172653863702163
|
496 |
+
web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.2270035578776477
|
497 |
+
web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.10072973751222168
|
498 |
+
web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.10398704909605484
|
499 |
+
web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.0979814053469226
|
500 |
+
web_nlg_en,3,median,rouge2_fmeasure,0.10072973751222168
|
501 |
+
web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.061883789388597316
|
502 |
+
web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.21128493584064525
|
503 |
+
web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.09678025171714386
|
504 |
+
web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.10086843076345688
|
505 |
+
web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.09599216387839389
|
506 |
+
web_nlg_en,4,median,rouge2_fmeasure,0.09678025171714386
|
507 |
+
web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.06197974009288303
|
508 |
+
web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.20065053485376905
|
509 |
+
web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.09493898136075185
|
510 |
+
web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.09425690323781327
|
511 |
+
web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.0949140921236195
|
512 |
+
web_nlg_en,5,median,rouge2_fmeasure,0.0949140921236195
|
513 |
+
web_nlg_en,5,average,multiple,0.08163765495349555
|
514 |
+
wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.0397636720249358
|
515 |
+
wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.012721191178839145
|
516 |
+
wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.024434310470910135
|
517 |
+
wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03327297097578151
|
518 |
+
wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.011216041205494898
|
519 |
+
wiki_lingua_en,0,median,rouge2_fmeasure,0.024434310470910135
|
520 |
+
wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.042828281320032364
|
521 |
+
wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.02711335795670395
|
522 |
+
wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.03141526988382421
|
523 |
+
wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.056766090400891124
|
524 |
+
wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.021015379090322476
|
525 |
+
wiki_lingua_en,1,median,rouge2_fmeasure,0.03141526988382421
|
526 |
+
wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.051091351936466585
|
527 |
+
wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.04454347983688272
|
528 |
+
wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.048234719869698274
|
529 |
+
wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.057748452491246806
|
530 |
+
wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.01856227196277119
|
531 |
+
wiki_lingua_en,2,median,rouge2_fmeasure,0.048234719869698274
|
532 |
+
wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.04482124580351551
|
533 |
+
wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03895725296191616
|
534 |
+
wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.04242364396606276
|
535 |
+
wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04689779702656875
|
536 |
+
wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.013461430147774255
|
537 |
+
wiki_lingua_en,3,median,rouge2_fmeasure,0.04242364396606276
|
538 |
+
wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.014133038738676246
|
539 |
+
wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.012100846520590037
|
540 |
+
wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.012171454699532493
|
541 |
+
wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013741746630537094
|
542 |
+
wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.003266077788148105
|
543 |
+
wiki_lingua_en,4,median,rouge2_fmeasure,0.012171454699532493
|
544 |
+
wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.002017912016348425
|
545 |
+
wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0018127250079443033
|
546 |
+
wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0013784838984500552
|
547 |
+
wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0023512305693387013
|
548 |
+
wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.0001630519530024726
|
549 |
+
wiki_lingua_en,5,median,rouge2_fmeasure,0.0018127250079443033
|
550 |
+
wiki_lingua_en,5,average,multiple,0.026748687316328696
|
551 |
+
winogrande,0,Replace,acc,0.500394632991318
|
552 |
+
winogrande,0,True or False,acc,0.494869771112865
|
553 |
+
winogrande,0,does underscore refer to,acc,0.4696132596685083
|
554 |
+
winogrande,0,stand for,acc,0.49171270718232046
|
555 |
+
winogrande,0,underscore refer to,acc,0.49171270718232046
|
556 |
+
winogrande,0,median,accuracy,0.49171270718232046
|
557 |
+
winogrande,1,Replace,acc,0.5035516969218626
|
558 |
+
winogrande,1,True or False,acc,0.4925019731649566
|
559 |
+
winogrande,1,does underscore refer to,acc,0.4909234411996843
|
560 |
+
winogrande,1,stand for,acc,0.4956590370955012
|
561 |
+
winogrande,1,underscore refer to,acc,0.47908445146014206
|
562 |
+
winogrande,1,median,accuracy,0.4925019731649566
|
563 |
+
winogrande,2,Replace,acc,0.5067087608524072
|
564 |
+
winogrande,2,True or False,acc,0.5074980268350434
|
565 |
+
winogrande,2,does underscore refer to,acc,0.48303078137332284
|
566 |
+
winogrande,2,stand for,acc,0.4909234411996843
|
567 |
+
winogrande,2,underscore refer to,acc,0.49171270718232046
|
568 |
+
winogrande,2,median,accuracy,0.49171270718232046
|
569 |
+
winogrande,3,Replace,acc,0.5217048145224941
|
570 |
+
winogrande,3,True or False,acc,0.5067087608524072
|
571 |
+
winogrande,3,does underscore refer to,acc,0.494869771112865
|
572 |
+
winogrande,3,stand for,acc,0.4980268350434096
|
573 |
+
winogrande,3,underscore refer to,acc,0.5138121546961326
|
574 |
+
winogrande,3,median,accuracy,0.5067087608524072
|
575 |
+
winogrande,4,Replace,acc,0.5177584846093133
|
576 |
+
winogrande,4,True or False,acc,0.5059194948697711
|
577 |
+
winogrande,4,does underscore refer to,acc,0.49171270718232046
|
578 |
+
winogrande,4,stand for,acc,0.5059194948697711
|
579 |
+
winogrande,4,underscore refer to,acc,0.5177584846093133
|
580 |
+
winogrande,4,median,accuracy,0.5059194948697711
|
581 |
+
winogrande,5,Replace,acc,0.5193370165745856
|
582 |
+
winogrande,5,True or False,acc,0.5043409629044988
|
583 |
+
winogrande,5,does underscore refer to,acc,0.4996053670086819
|
584 |
+
winogrande,5,stand for,acc,0.4988161010260458
|
585 |
+
winogrande,5,underscore refer to,acc,0.5035516969218626
|
586 |
+
winogrande,5,median,accuracy,0.5035516969218626
|
587 |
+
winogrande,5,average,multiple,0.4986845566956064
|
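The merged.csv added above stores one row per (dataset, fewshots, prompt, metric) combination, plus precomputed "median" and "average" rows. Below is a minimal sketch, not part of the repository, of how those aggregates could be recomputed from the raw per-prompt rows; the pandas dependency and the relative file path are assumptions, while the column names come from the CSV header shown above.

```python
# Minimal sketch (assumed setup): re-aggregate the per-prompt scores in merged.csv.
import pandas as pd

# Path assumed to match the file added in this commit.
df = pd.read_csv("4b284b42bc4/eval/merged.csv")

# Keep only raw per-prompt rows; drop the precomputed aggregate rows.
raw = df[~df["prompt"].isin(["median", "average"])]

# Median score across prompts for each task / few-shot setting / metric,
# which should correspond to the "median" rows stored in the CSV.
medians = (
    raw.groupby(["dataset", "fewshots", "metric"])["value"]
       .median()
       .reset_index()
)
print(medians.head())
```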
4b284b42bc4/eval/merged.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5219512537266193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0212930030956169}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07218926971887347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012167661516642302}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.38930719643519357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004954916384960566}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11405231602258174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001654644088764185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03343887788145288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007607255949351679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1941492016104801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003831607721687585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0530914234584236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010851048561266318}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06799635945918656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011456716958398708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.36060457316843075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004393491915214021}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10723485397749022, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015594878594255188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06917630307565754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011778495782492331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3693625063067186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004599410126151537}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10913009935368997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015983050972786382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
|
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json
ADDED
@@ -0,0 +1 @@
1 |
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6352070705519831, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.032408525336239634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07232274024412362, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011648296294780197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.40875337262130357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005170431645641866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11545860904836557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016293321313512947}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03357621675403705, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007258241400547502}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20421979666303372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038705321992327218}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05387918484618448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001059642873696044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0670219793734058, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010711080482606746}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3719335535759742, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004473658738704575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10673572587703606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001490340938407792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06880374952093221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011085999525674158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.38592664938862503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004766870365189962}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10969517770602641, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015436923533265093}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
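Note: every agg.*.json file added in this commit shares the same two-level layout visible above — a "results" list holding one metric per record for the (task_name, prompt_name) pair, each with a matching "<metric>_stderr", and a "config" block recording the run settings (model_args, num_fewshot, batch_size, limit, seed). Below is a minimal sketch of how such a file could be inspected locally; the path is illustrative and assumes the file has been downloaded.

```python
# Sketch only: inspect one of the aggregated eval files added in this commit.
# The path is illustrative; point it at any downloaded agg.*.json file.
import json

path = "4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json"

with open(path) as f:
    data = json.load(f)

# Each record in "results" carries exactly one metric for a (task, prompt) pair,
# plus its bootstrap standard error under "<metric>_stderr".
for record in data["results"]:
    task = record["task_name"]
    prompt = record["prompt_name"]
    for key, value in record.items():
        if isinstance(value, float) and not key.endswith("_stderr"):
            stderr = record.get(f"{key}_stderr")
            print(f"{task} | {prompt} | {key} = {value:.4f} (stderr {stderr})")

# Run settings are stored once per file under "config".
print(data["config"]["num_fewshot"], data["config"]["model_args"])
```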
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6721480622931579, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04364421620033658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07312830538850522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012172573610058964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.41786672242074735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00514089320113892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11682333840421252, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016784638187503072}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03391327016863078, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007434663573137818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21097279270777233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003962368460085698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05451282884342794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001075049065599178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06674973450321794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001076625707211798}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3763932336189791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004418572944831253}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10645004232970325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014835449120631779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06947768209278911, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011601752390396401}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.39433935812714066, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00474918433328278}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1108401266606722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015949626703580934}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.45946097160655536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0069054289444320065}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4838082934841815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005353612871054305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.40318520685428916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0050543331024213555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.25981848081805536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005464287303749197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2614431240216551, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004532273813349631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.22117893046486378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004159001287103087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.3790825033320128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006113945088066168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.40459178460001194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004829957825352463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3311134882286364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004433468116819751}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.40522430638291235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006350898587976966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4260396970904746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004984787219803331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.35357342466071756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004606600028590757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 4.728695653942699, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20951842561350045}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
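Note: the prompt_jinja string stored with each record is a promptsource-style Jinja2 template; the " ||| " marker conventionally separates the rendered prompt from its reference completion(s). A hedged sketch rendering the "explicit-graph-description2" template above with a made-up WebNLG-style triple (the example input and reference are illustrative, not taken from the dataset):

```python
# Sketch only: render the recorded prompt_jinja with the jinja2 library.
from jinja2 import Template

prompt_jinja = (
    '{{input | join(", ")}}. \n\n'
    "The above is a set of subject | predicate | object expressions separated by commas: \n"
    "Write all the information in proper sentences. "
    "{% for i in references %}\n ||| {{ i }} \n{% endfor %}"
)

example = {
    "input": ["Alan_Bean | birthPlace | Wheeler,_Texas"],      # hypothetical triple
    "references": ["Alan Bean was born in Wheeler, Texas."],   # hypothetical reference
}

rendered = Template(prompt_jinja).render(**example)
# " ||| " splits the model input from the reference completion(s).
prompt_text, *targets = rendered.split("|||")
print(prompt_text.strip())
print(targets)
```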
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5132210909931271, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006770200037421206}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5085672940783099, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005219014740462864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.44649942768587786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004938340775431664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.2962906481288477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005484844549091213}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.28550066477044644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004627831211110698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2516831348306221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004237671368186907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.42385682852035556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006080173471955614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4235294811450316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004789452676639955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3673926906028998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004434765375125032}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4544084219641523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006266153839723712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.44938547317564664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004876193487205648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.39330123509502896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004530237812729094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.212392173804924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3162249116972436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5199440915994101, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006624486616712392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5191675780106996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050701091896996385}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.4537703875613086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004716616160390303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3031051480032843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005329062576870483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2963979153791751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044953737902237645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2576884269938096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004005629924060058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.4266291186121007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005898467057932758}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4282197768995522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004680222294006507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.36978135277630003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004192067697375704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4599896912593643, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006130864600093757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4576419090445212, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004749819488017943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.39852235622906906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. 
\n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004308718484971879}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.5704316145072506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21448561410544192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5083670715467508, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04793752313730723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.13731115268889618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004134328663985652}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5304312650087452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005137749962807634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.1770932101083039, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032503426283328517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.07057132542466306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002672953645282832}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2756854307207382, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00443727730222414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.08858118067497767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022195618622123207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.11318481239908815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034285009887527296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.45774658527361894, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004751494427100153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.14699962330240496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002639563461886689}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.12133841288359587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036717057968418485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4712273535434232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004840505439839539}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.15663423920541245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00293591591942458}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5689932623794205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03487597258964387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.15135031922863815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0045240430311319964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5274677588088789, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005168283267163321}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.18658181154870737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032951371932804074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0812976310351736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003052517248887665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2876975324614106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004539489201779465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.09696397066162622, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023123478544169115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.12650520735598997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003931360833531472}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.45320651913396676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004927916099837654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15529696958918174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027889786826169605}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.13503878881995351, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004117933379311923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4710862176395619, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004955060111554295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.16572981825064162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029891616403949226}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5666625357618493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.035144502313160096}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.16225163818440333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004930277023326987}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5198390606155927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005004253324448323}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.19269563367126713, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034429019094789712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.08999073165958532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0034053408100276723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.290142216690991, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004435580660257063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.10319722625311183, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002497320165770724}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.13522779384865216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004269818421199203}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4424221249100704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004698632856913699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15984455179709206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002959046280237486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.14570044951057284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004496072255677459}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.46658013872741555, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. 
Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004743391558616772}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.17246353477949722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031557973499711975}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
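Each agg.*.json file added in this commit is a single JSON object with a "results" list (one entry per metric, each paired with a *_stderr companion) and a "config" block recording the model, shot count, and sampling limits. The snippet below is a minimal sketch, not part of the original repo, showing one way to flatten such a file into rows; the path is illustrative and simply points at the file shown directly above.

```python
import json
from pathlib import Path

# Minimal sketch (illustrative, not from the original repo): flatten one
# aggregated eval file into (task, prompt, shots, metric, value, stderr) rows.
path = Path("4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.json")
data = json.loads(path.read_text())

shots = data["config"]["num_fewshot"]
rows = []
for entry in data["results"]:
    # Each entry carries exactly one scalar metric plus its *_stderr companion.
    for key, value in entry.items():
        if isinstance(value, float) and not key.endswith("_stderr"):
            rows.append((entry["task_name"], entry["prompt_name"], shots,
                         key, value, entry.get(f"{key}_stderr")))

for row in rows:
    print(row)
```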
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.json
ADDED
@@ -0,0 +1 @@
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1819238915458583, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002750353826660657}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.7029413363719536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003808929381311445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.26990147215432525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029611077960044795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08828034705691859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001756213173675814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.36957211038213966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004295231025460387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.1321765392862873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002035893685578508}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1395246877294493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020982257188054973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5674658932955065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004066884538719425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.20873220289543726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022417883144256346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.15727492631630607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024406377529898574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6165732680882078, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037569757915729735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.23372715316587211, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026271080876903194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.7848178181585044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11165273099003932}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
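The prompt_jinja field in these entries is a promptsource-style template: everything rendered before the "|||" marker is the text fed to the model, and everything after it is a reference target used for ROUGE/BLEU scoring. Below is a minimal sketch of rendering the "non-explicit-description" template with jinja2; the triple and reference sentence are made up for illustration, not taken from the GEM/web_nlg data.

```python
from jinja2 import Template

# Sketch only: render the "non-explicit-description" template from the results
# above. The example input/references are illustrative, not real dataset rows.
prompt_jinja = (
    'I am taking this tabular data, where each row is separated by a "|" and '
    'expresses a relation between an object and a predicate : {{input | join(", ")}}. \n\n'
    'Now, I will produce a description of the tabular data using English sentences. '
    '{% for i in references %}\n ||| {{ i }} \n{% endfor %}'
)

example = {
    "input": ["Alan_Bean | nationality | United_States"],
    "references": ["Alan Bean is an American former astronaut."],
}

rendered = Template(prompt_jinja).render(**example)
prompt_text, target = rendered.split("|||", 1)
print(prompt_text.strip())   # text presented to the model
print(target.strip())        # reference used when scoring the generation
```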
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.json
ADDED
@@ -0,0 +1 @@
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.16916143883050352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002379872667769485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6810622565859463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004015102382030035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.25585929817242026, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027996781558269953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08144241568678086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014775734418982862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3576674736285389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004354118064844373}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.12475959136600752, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019179222108368782}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.12901201268176216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017552360200443738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.547948402844493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004162509193062588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.19697428121356045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021070987747823585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.1469702815726482, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002085618531486058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6026159583567549, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003950263273368332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.22282022865304935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024572204259037207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.713664635412786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08533018956868954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_4.json
ADDED
@@ -0,0 +1 @@
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1578995517139064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002188262308646243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.661260656597642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004201918635105074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.24049035651787887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00265323510540592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.0765689099285028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013983221970486452}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3518820790483638, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004418107440625639}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.11816403232435718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018236102355301655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.12155855741792118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016664561371413115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5312655407554171, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004223895153456735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.18655013034032525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020589747081609495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.13827095197584383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. 
{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019334746802031329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5862691174388023, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004033599895639771}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.21089639595956103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023465624968102092}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.469805707725389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06490777021012714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
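With the 2-, 3-, and 4-shot files for the same prompt now in place (for non-explicit-description above, BLEU drifts from roughly 2.78 at 2 shots to 2.47 at 4 shots), a small sketch like the following can track one metric across shot counts. It assumes only the agg.<model>_<task>_<prompt>_<shots>.json naming visible in this commit.

```python
import glob
import json

# Sketch only: gather BLEU for one prompt across the few-shot settings added
# above, relying on the file-naming pattern used in this eval directory.
pattern = "4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_*.json"

by_shots = {}
for path in glob.glob(pattern):
    with open(path) as f:
        data = json.load(f)
    shots = data["config"]["num_fewshot"]
    bleu = next(r["bleu"] for r in data["results"] if "bleu" in r)
    by_shots[shots] = bleu

for shots in sorted(by_shots):
    print(f"{shots}-shot BLEU: {by_shots[shots]:.3f}")
```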
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.json
ADDED
@@ -0,0 +1 @@
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.17794283547769393, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004855332389330506}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6738492336641515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004349540444134296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.22783710711426441, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038760861020319783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.09239931377519814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033174852810728913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3553116109965894, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004440024800683586}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.11404511235265344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026899526031591296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.1419890688901028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004098782466396125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5573937424840384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004380416051067475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.18086674415244586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003193856695239382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.15719935748984376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00433118114964069}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6040019474400884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004432909544722048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.20152895642827887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003482992973760569}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.42884817219882, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.099116139881655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.json
ADDED
@@ -0,0 +1 @@
+
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.18751101358462757, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005063466765827528}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6803997964079052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004146984246148592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.23996034128160132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004098823532354845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.09981197167240217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0034490088940847723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.363206517232566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044053972626213736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.1236406159290505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0029657970432989938}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.14948107867414953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00430849430541505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5591640891574615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004222675152018815}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.19036064787323584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0034698000115775366}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.16593161383927957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004490090618309719}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6127549245098377, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0042508539477660486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.21344647829746557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036991334796597887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.648377702538737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08478382467689291}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.2170502412040914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0056785623479505346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6707499226771557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004197291013190362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.26187418406645563, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004479195310716335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.12026175423850448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004116366667789958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.35899246341620966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004337077760186949}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.13864465330888276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0034090024819216245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.1743575606183526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0049703919122685986}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5431873257825337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. 
As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004174698396559969}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.20774552707149974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003911098641553765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.19370196026746925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005146435869582002}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6067084039239514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. 
\n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004228274627594922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.23391725441186811, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0040594860490940805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.657390816846655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07493284644080055}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
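Each agg.*.json file added in this commit has the same shape: a "results" list in which every entry carries one score (a ROUGE precision/recall/f-measure or BLEU) together with its bootstrap stderr and the task, prompt, and Jinja-template metadata, plus a "config" block recording the model arguments, few-shot count, and other run settings. A minimal, hypothetical Python sketch (standard library only; the helper name and the file path in the usage line are illustrative, not part of this repository) for pulling the headline scores out of one of these files:

import json

# Hypothetical helper (not part of this commit): print the headline metrics
# stored in one aggregated evaluation file of the form shown above.
def summarize_agg(path):
    with open(path) as f:
        data = json.load(f)
    shots = data["config"]["num_fewshot"]
    for record in data["results"]:
        task = record["task_name"]        # e.g. "GEM/web_nlg_en"
        prompt = record["prompt_name"]    # e.g. "very-explicit-description"
        for key, value in record.items():
            # Each entry pairs one metric with its bootstrap stderr.
            if key.endswith("_fmeasure") or key == "bleu":
                stderr = record.get(key + "_stderr")
                print(f"{task} | {prompt} | {shots}-shot | {key}={value:.4f} (stderr={stderr})")

# Illustrative usage (path taken from the files added above):
# summarize_agg("4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.json")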
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.18480731121362143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020792576907124544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.315121891878082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00281956327717947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.21546625995717986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019044373825407577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.043591724134436294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009100610866868915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07663865850457913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016683677543594749}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05085522769122296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009806547557348483}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.13136007081848963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001407474447704256}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2315354297581157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022376304823093543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1545786655331593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001294688576797968}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.17135315593483375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019310536492561348}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.29331614741662265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002671841052773937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19998174393388882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017739334682302418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.4986435841452175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0954009333366125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.16054946339149265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023872806268002453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.26124779652266183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034109215483210933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.18055709030131595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002291239666041859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03844271053495923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009548717405153122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06572671257249793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001676567990321315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.043685786411164565, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009951233609164975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1146885610289566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016607411459830235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.19312647848640802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026528744495399642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1299739944513172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016020857996698003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.14933338107720576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022223746600941365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.24353771456671877, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032013282199569457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16805250413658923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002137202674713072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6906547201013016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10561504116700811}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.05199770725713768, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018998031112635958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.08654478337265463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003030755442148775}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.05713385129965973, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019394092460641402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.011716489586243313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006188408595148423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.021706213807673947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001160694947893896}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.013403383907298045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006520713115630765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.03835312744448057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014121874773050304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.06471677863725485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002320636959830996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.041882673914681416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014105973125080264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.048224676522694826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017648142477153747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.0803093006676827, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002819219094971394}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05296871705271611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018034377431592255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.5289185378545499, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03725901160237639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.16287938762344858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002125596227207644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.26891683127031046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029264720716827195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1876830793081277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020430243309695934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0396793386777498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009455884698469472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.06791041476534408, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016416822093375813}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.045953265934537524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010171810986012006}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.12593110207935895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014937847050657449}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.2160718901557372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024089218033376556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1469663361895663, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014726919857113133}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.15017973881687044, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001965190714131917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.2491465690438633, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002748477636582947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.17332222163527497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018961374186126187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.8244022964598483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07935443727949992}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.14381543741495909, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023667032912517466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.225629388401208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003254182304827278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15868452002960928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002240691662636904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.035731574000179026, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010349488403740507}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05816887351927841, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016059454698874895}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.039489525521791025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000990906960301999}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.1126446519610345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017869301538348855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.18198894030060583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002668023279693901}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1251846925684636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016856714050454754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.13318454491534204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022141082314480645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.20977905263291247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030675162093798812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14690611817944446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020806139758284456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.9288842585821677, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08545851002343076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.046670256378216396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018631696503052615}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.07269676929849611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002709598303284025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.04931235783073529, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018027220859303127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.011045473611349471, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006328455956303861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.019434558915915258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001153106186544495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012225365615349182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006470992700405894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.03764501152188949, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00147394811246321}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.060278823007865176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022646729396056334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.04000155130112757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014299390749652758}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.04298247190464882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001738069209317656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.06679218344108083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025083789140632304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04523563611791948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001662538594980625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.49677796073841696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.052625922121863185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.18099679964244167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022642402116103914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2981323814293587, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00283223986352777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.20682521295592035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001994458501946162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04315528231168242, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009826962822616337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.07284671298198961, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001691310520894889}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04926653251882711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001025431194341212}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.13913612901543207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001621884412931162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.2382490384217888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002334550886161128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.16091879232181885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014094494994621246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.16710754510866463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002120469796828209}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.27654018110872997, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002684831096463094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1910349072329293, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018521715266141294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.8561556111879174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0728982607399944}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1589750600600723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026278503972424322}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.24856960492614927, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034290945996938577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.17315198648906838, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023395404402379518}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04013687657396167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011383641303593693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0650129563997214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017038204023901537}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04375734535627615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010346234714461327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.12283468138914441, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019880477155036994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.19794072747052693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002794014068329132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.13485492202044252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017498822051336784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.14708276519326932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002450731932737247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.23071550304096008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003222080601723914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.16017715054289589, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021714589994061635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 3.1365248222604922, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11628221415118109}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.049068436227218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001993000314908942}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.0775740659139895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029232679441589045}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.051872966574056024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019061734901385107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.01199072000032627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007398590703300153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0204455235276103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012282519026501563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.012765677291574816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006703661625741191}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.038829622758228356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015610696154773989}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.0634321576713279, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002441124896735242}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.041302516354620705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014872448773892877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.045154904255880535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018406863174026315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.07128105012283013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027050833760683883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04760963919336614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017500349071780813}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.48599285663479985, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04001588316652065}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2022231395663083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002327440444373304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3212542850069266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028325100234559927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22687895073683426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019722087479110005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05298207477606098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011059450545409227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0867754880829245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018124553730314602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05947240390341843, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011148780082595852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14766643493041728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016788338597698632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.24166027084574745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023102550180774937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1667829458147027, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013918547568942457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18955567145755783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021818874452280475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.30206706797305843, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026937255424983246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21293287161862917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018545411601376014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.26946391692664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06690865495365421}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17603411917572567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027088089976067017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2612353818883475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033992744560135033}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18692561650624942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023335888686542303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04559336904702604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012112817237957673}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06990530615420809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017473693679177455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04827221140963156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001080517273254867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13188323013007386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020767846091407608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19847294800292903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002681619286646281}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13971894217937625, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017066104221784629}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1648347813541367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025407224451518312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24568842200265278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032363366658664243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17522886133321208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021928506565670742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0982581904465443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09308367989527225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05811148139738531, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022224537018629854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08558209366311444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003003751048628007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.058933509019544375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002002584327257243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.014672079613835081, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008619146516289935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02381134201045262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012808609018768735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.015382205296321172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000747616978372682}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04421230588130963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017253556363268328}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06610553705292643, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002375418784846169}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04464141924307046, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015078323046653491}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05425853906264316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002080511020141031}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07994658087674156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028206819757526707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05503716623523508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018776883247322259}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.6565411757725536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.052324362211905354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11186372200442046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018671387732113079}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17879552912162308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027643379350067035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.12691648074851333, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001896734902040353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01717084786190784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006542423399459584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.029277153235119115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011365556453727173}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.019772682308363394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007124577758867392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08411171356077761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012658897573498854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.13787036917147713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020240765955970865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09604059787645051, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001274636062224721}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.10367309622865617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017302566309302105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.16661914846277825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025842506153635845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11782155638903533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017597196977228665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0455288685483874, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.047635370761101796}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.09476998627330442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002049387309515581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.14233990548855194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028599756389382966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10224304189369153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019828555273926772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01579253403515002, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000714960507379995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.025370051032367204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010971245243116302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.017441204975488303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006872287960301365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0726982989153829, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001524607663698719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.11120691117595559, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021760779267226923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.07842574917248762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014189355674646296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.08821003752248946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019202411693044791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.13254544508118085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002664772195130541}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.09504316855119928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018454346298717522}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2230608250631039, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08212157026799911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.02400887856928141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012801233338843008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.03676848661205508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018777990211272366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0251536532172704, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012297831296573945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0040671748623580355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004261458773707568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.006848332649418315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007065553357872231}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.004349993193557413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003848818988530138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.018902837570424644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009887113843588492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.029393904053863548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014777305097725733}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.019722039262009786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009212732368853081}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.02224536000159842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012029315455249722}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.03371996491485194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001725783795993067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.023124878533727647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001135976545095596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.08481215880086364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. 
||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008615953892796454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402709}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.309, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_5.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932573}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.308, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014606483127342761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.359, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015177264224798594}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.357, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015158521721486769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_5.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.347, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015060472031706617}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934649}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_5.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01480686473373886}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.318, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_5.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01478291360099667}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.314, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_4.json
ADDED
@@ -0,0 +1 @@
{"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363937}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811483}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
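Note on the files above: every agg.*.json added in this commit follows the same lm-evaluation-harness aggregate layout, a "results" list with one entry per metric (here "acc" and "acc_norm", each paired with a bootstrap standard error) and a "config" block recording the model path, tokenizer, few-shot count, batch size, and evaluation limit. The following is a minimal, hypothetical sketch (not part of this commit) of reading one such file and printing its metrics; the path and field names are taken from the JSON shown above.

import json

# Path of one of the aggregate result files added in this commit.
path = "4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_4.json"

with open(path) as f:
    data = json.load(f)

shots = data["config"]["num_fewshot"]
for res in data["results"]:
    # Each entry carries either acc or acc_norm, plus its bootstrap stderr.
    if "acc" in res:
        print(f'{res["task_name"]} / {res["prompt_name"]} ({shots}-shot): '
              f'acc={res["acc"]:.3f} ± {res["acc_stderr"]:.4f}')
    if "acc_norm" in res:
        print(f'{res["task_name"]} / {res["prompt_name"]} ({shots}-shot): '
              f'acc_norm={res["acc_norm"]:.3f} ± {res["acc_norm_stderr"]:.4f}')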