Muennighoff commited on
Commit
6603402
·
1 Parent(s): 862213b
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +153 -0
  2. 4b284b12bc4/eval/merged.csv +587 -0
  3. 4b284b12bc4/eval/merged.json +0 -0
  4. 4b284b17bc4/eval/merged.csv +587 -0
  5. 4b284b17bc4/eval/merged.json +0 -0
  6. 4b284b21bc4/eval/merged.csv +587 -0
  7. 4b284b21bc4/eval/merged.json +0 -0
  8. 4b284b28bc4/eval/merged.csv +587 -0
  9. 4b284b28bc4/eval/merged.json +0 -0
  10. 4b284b42bc4/eval/merged.csv +587 -0
  11. 4b284b42bc4/eval/merged.json +0 -0
  12. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json +1 -0
  13. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json +1 -0
  14. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json +1 -0
  15. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.json +1 -0
  16. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.json +1 -0
  17. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_4.json +1 -0
  18. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.json +1 -0
  19. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.json +1 -0
  20. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.json +1 -0
  21. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.json +1 -0
  22. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.json +1 -0
  23. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_4.json +1 -0
  24. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.json +1 -0
  25. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.json +1 -0
  26. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.json +1 -0
  27. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.json +1 -0
  28. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.json +1 -0
  29. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_4.json +1 -0
  30. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.json +1 -0
  31. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.json +1 -0
  32. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_4.json +1 -0
  33. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.json +1 -0
  34. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.json +1 -0
  35. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_4.json +1 -0
  36. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json +1 -0
  37. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json +1 -0
  38. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json +1 -0
  39. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.json +1 -0
  40. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.json +1 -0
  41. 4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_4.json +1 -0
  42. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_4.json +1 -0
  43. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_5.json +1 -0
  44. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_4.json +1 -0
  45. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_5.json +1 -0
  46. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_4.json +1 -0
  47. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_5.json +1 -0
  48. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.json +1 -0
  49. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_5.json +1 -0
  50. 4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_4.json +1 -0
.gitattributes CHANGED
@@ -2939,3 +2939,156 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
2939
  4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
2940
  4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text
2941
  4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2939
  4b284b42bc4/eval/examples.4b284b42bc4_anli_r1_guaranteed-possible-impossible_1.jsonl filter=lfs diff=lfs merge=lfs -text
2940
  4b284b17bc4/eval/examples.4b284b17bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text
2941
  4b284b21bc4/eval/examples.4b284b21bc4_e2e_nlg_cleaned_coherent_text_2.jsonl filter=lfs diff=lfs merge=lfs -text
2942
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text
2943
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
2944
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
2945
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text
2946
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
2947
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
2948
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
2949
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
2950
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
2951
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text
2952
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_4.jsonl filter=lfs diff=lfs merge=lfs -text
2953
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
2954
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
2955
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
2956
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_5.jsonl filter=lfs diff=lfs merge=lfs -text
2957
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
2958
+ 4b284b84bc4/eval/examples.4b284b84bc4_copa_choose_5.jsonl filter=lfs diff=lfs merge=lfs -text
2959
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_4.jsonl filter=lfs diff=lfs merge=lfs -text
2960
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text
2961
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
2962
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
2963
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
2964
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_5.jsonl filter=lfs diff=lfs merge=lfs -text
2965
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
2966
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
2967
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl filter=lfs diff=lfs merge=lfs -text
2968
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
2969
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
2970
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
2971
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
2972
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_5.jsonl filter=lfs diff=lfs merge=lfs -text
2973
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
2974
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
2975
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
2976
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_5.jsonl filter=lfs diff=lfs merge=lfs -text
2977
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_5.jsonl filter=lfs diff=lfs merge=lfs -text
2978
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
2979
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text
2980
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_5.jsonl filter=lfs diff=lfs merge=lfs -text
2981
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
2982
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
2983
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
2984
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
2985
+ 4b284b84bc4/eval/examples.4b284b84bc4_copa_plausible_alternatives_5.jsonl filter=lfs diff=lfs merge=lfs -text
2986
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
2987
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_4.jsonl filter=lfs diff=lfs merge=lfs -text
2988
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_4.jsonl filter=lfs diff=lfs merge=lfs -text
2989
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question_5.jsonl filter=lfs diff=lfs merge=lfs -text
2990
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.jsonl filter=lfs diff=lfs merge=lfs -text
2991
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
2992
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_pick_the_most_correct_option_4.jsonl filter=lfs diff=lfs merge=lfs -text
2993
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
2994
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
2995
+ 4b284b84bc4/eval/examples.4b284b84bc4_copa_cause_effect_5.jsonl filter=lfs diff=lfs merge=lfs -text
2996
+ 4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
2997
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
2998
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
2999
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_5.jsonl filter=lfs diff=lfs merge=lfs -text
3000
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_4.jsonl filter=lfs diff=lfs merge=lfs -text
3001
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
3002
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
3003
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl filter=lfs diff=lfs merge=lfs -text
3004
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
3005
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_4.jsonl filter=lfs diff=lfs merge=lfs -text
3006
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
3007
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_4.jsonl filter=lfs diff=lfs merge=lfs -text
3008
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
3009
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
3010
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
3011
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_Replace_4.jsonl filter=lfs diff=lfs merge=lfs -text
3012
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
3013
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
3014
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
3015
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r1_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
3016
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text
3017
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_what_is_the_correct_ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
3018
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Direct-Question-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text
3019
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_5.jsonl filter=lfs diff=lfs merge=lfs -text
3020
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
3021
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_5.jsonl filter=lfs diff=lfs merge=lfs -text
3022
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_4.jsonl filter=lfs diff=lfs merge=lfs -text
3023
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
3024
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
3025
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_5.jsonl filter=lfs diff=lfs merge=lfs -text
3026
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Story-Continuation-and-Options_4.jsonl filter=lfs diff=lfs merge=lfs -text
3027
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
3028
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_5.jsonl filter=lfs diff=lfs merge=lfs -text
3029
+ 4b284b84bc4/eval/examples.4b284b84bc4_copa_i_am_hesitating_5.jsonl filter=lfs diff=lfs merge=lfs -text
3030
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_5.jsonl filter=lfs diff=lfs merge=lfs -text
3031
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text
3032
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
3033
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
3034
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
3035
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_4.jsonl filter=lfs diff=lfs merge=lfs -text
3036
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
3037
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_4.jsonl filter=lfs diff=lfs merge=lfs -text
3038
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
3039
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Novel-Correct-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
3040
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl filter=lfs diff=lfs merge=lfs -text
3041
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-(Closed-Book)_4.jsonl filter=lfs diff=lfs merge=lfs -text
3042
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_5.jsonl filter=lfs diff=lfs merge=lfs -text
3043
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_4.jsonl filter=lfs diff=lfs merge=lfs -text
3044
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_4.jsonl filter=lfs diff=lfs merge=lfs -text
3045
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_GPT-3-style_4.jsonl filter=lfs diff=lfs merge=lfs -text
3046
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
3047
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
3048
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_4.jsonl filter=lfs diff=lfs merge=lfs -text
3049
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
3050
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.jsonl filter=lfs diff=lfs merge=lfs -text
3051
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
3052
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_4.jsonl filter=lfs diff=lfs merge=lfs -text
3053
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_stand-for_4.jsonl filter=lfs diff=lfs merge=lfs -text
3054
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
3055
+ 4b284b84bc4/eval/examples.4b284b84bc4_copa_best_option_5.jsonl filter=lfs diff=lfs merge=lfs -text
3056
+ 4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_should-assume_5.jsonl filter=lfs diff=lfs merge=lfs -text
3057
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_4.jsonl filter=lfs diff=lfs merge=lfs -text
3058
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_5.jsonl filter=lfs diff=lfs merge=lfs -text
3059
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
3060
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
3061
+ 4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
3062
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_5.jsonl filter=lfs diff=lfs merge=lfs -text
3063
+ 4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_guaranteed-true_5.jsonl filter=lfs diff=lfs merge=lfs -text
3064
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_does-underscore-refer-to_5.jsonl filter=lfs diff=lfs merge=lfs -text
3065
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl filter=lfs diff=lfs merge=lfs -text
3066
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Choose-Story-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
3067
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r3_justified-in-saying_4.jsonl filter=lfs diff=lfs merge=lfs -text
3068
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_can-we-infer_4.jsonl filter=lfs diff=lfs merge=lfs -text
3069
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_5.jsonl filter=lfs diff=lfs merge=lfs -text
3070
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_justified-in-saying_5.jsonl filter=lfs diff=lfs merge=lfs -text
3071
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_MNLI-crowdsource_4.jsonl filter=lfs diff=lfs merge=lfs -text
3072
+ 4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_4.jsonl filter=lfs diff=lfs merge=lfs -text
3073
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_5.jsonl filter=lfs diff=lfs merge=lfs -text
3074
+ 4b284b84bc4/eval/examples.4b284b84bc4_winogrande_True-or-False_5.jsonl filter=lfs diff=lfs merge=lfs -text
3075
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_5.jsonl filter=lfs diff=lfs merge=lfs -text
3076
+ 4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_does-it-follow-that_5.jsonl filter=lfs diff=lfs merge=lfs -text
3077
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
3078
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text
3079
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_4.jsonl filter=lfs diff=lfs merge=lfs -text
3080
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_4.jsonl filter=lfs diff=lfs merge=lfs -text
3081
+ 4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_5.jsonl filter=lfs diff=lfs merge=lfs -text
3082
+ 4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
3083
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Generate-Ending_4.jsonl filter=lfs diff=lfs merge=lfs -text
3084
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_4.jsonl filter=lfs diff=lfs merge=lfs -text
3085
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_challenge_multiple_choice_4.jsonl filter=lfs diff=lfs merge=lfs -text
3086
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_choose-the-most-appropriate-solution_4.jsonl filter=lfs diff=lfs merge=lfs -text
3087
+ 4b284b84bc4/eval/examples.4b284b84bc4_superglue_rte_GPT-3-style_5.jsonl filter=lfs diff=lfs merge=lfs -text
3088
+ 4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl filter=lfs diff=lfs merge=lfs -text
3089
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_4.jsonl filter=lfs diff=lfs merge=lfs -text
3090
+ 4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_qa_options_4.jsonl filter=lfs diff=lfs merge=lfs -text
3091
+ 4b284b84bc4/eval/examples.4b284b84bc4_cb_guaranteed-possible-impossible_5.jsonl filter=lfs diff=lfs merge=lfs -text
3092
+ 4b284b84bc4/eval/examples.4b284b84bc4_piqa_pick_correct_choice_index_4.jsonl filter=lfs diff=lfs merge=lfs -text
3093
+ 4b284b84bc4/eval/examples.4b284b84bc4_story_cloze_2016_Answer-Given-options_5.jsonl filter=lfs diff=lfs merge=lfs -text
3094
+ 4b284b84bc4/eval/examples.4b284b84bc4_anli_r2_MNLI-crowdsource_5.jsonl filter=lfs diff=lfs merge=lfs -text
4b284b12bc4/eval/merged.csv ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ anli_r1,0,GPT-3 style,acc,0.334
3
+ anli_r1,0,MNLI crowdsource,acc,0.334
4
+ anli_r1,0,can we infer,acc,0.336
5
+ anli_r1,0,guaranteed/possible/impossible,acc,0.323
6
+ anli_r1,0,justified in saying,acc,0.329
7
+ anli_r1,0,median,accuracy,0.334
8
+ anli_r1,1,GPT-3 style,acc,0.334
9
+ anli_r1,1,MNLI crowdsource,acc,0.333
10
+ anli_r1,1,can we infer,acc,0.325
11
+ anli_r1,1,guaranteed/possible/impossible,acc,0.33
12
+ anli_r1,1,justified in saying,acc,0.327
13
+ anli_r1,1,median,accuracy,0.33
14
+ anli_r1,2,GPT-3 style,acc,0.349
15
+ anli_r1,2,MNLI crowdsource,acc,0.361
16
+ anli_r1,2,can we infer,acc,0.352
17
+ anli_r1,2,guaranteed/possible/impossible,acc,0.323
18
+ anli_r1,2,justified in saying,acc,0.345
19
+ anli_r1,2,median,accuracy,0.349
20
+ anli_r1,3,GPT-3 style,acc,0.33
21
+ anli_r1,3,MNLI crowdsource,acc,0.335
22
+ anli_r1,3,can we infer,acc,0.345
23
+ anli_r1,3,guaranteed/possible/impossible,acc,0.32
24
+ anli_r1,3,justified in saying,acc,0.349
25
+ anli_r1,3,median,accuracy,0.335
26
+ anli_r1,4,GPT-3 style,acc,0.318
27
+ anli_r1,4,MNLI crowdsource,acc,0.332
28
+ anli_r1,4,can we infer,acc,0.327
29
+ anli_r1,4,guaranteed/possible/impossible,acc,0.309
30
+ anli_r1,4,justified in saying,acc,0.333
31
+ anli_r1,4,median,accuracy,0.327
32
+ anli_r1,5,GPT-3 style,acc,0.321
33
+ anli_r1,5,MNLI crowdsource,acc,0.343
34
+ anli_r1,5,can we infer,acc,0.315
35
+ anli_r1,5,guaranteed/possible/impossible,acc,0.33
36
+ anli_r1,5,justified in saying,acc,0.333
37
+ anli_r1,5,median,accuracy,0.33
38
+ anli_r1,5,average,multiple,0.33416666666666667
39
+ anli_r2,0,GPT-3 style,acc,0.336
40
+ anli_r2,0,MNLI crowdsource,acc,0.334
41
+ anli_r2,0,can we infer,acc,0.336
42
+ anli_r2,0,guaranteed/possible/impossible,acc,0.325
43
+ anli_r2,0,justified in saying,acc,0.319
44
+ anli_r2,0,median,accuracy,0.334
45
+ anli_r2,1,GPT-3 style,acc,0.305
46
+ anli_r2,1,MNLI crowdsource,acc,0.315
47
+ anli_r2,1,can we infer,acc,0.312
48
+ anli_r2,1,guaranteed/possible/impossible,acc,0.313
49
+ anli_r2,1,justified in saying,acc,0.314
50
+ anli_r2,1,median,accuracy,0.313
51
+ anli_r2,2,GPT-3 style,acc,0.305
52
+ anli_r2,2,MNLI crowdsource,acc,0.336
53
+ anli_r2,2,can we infer,acc,0.332
54
+ anli_r2,2,guaranteed/possible/impossible,acc,0.328
55
+ anli_r2,2,justified in saying,acc,0.335
56
+ anli_r2,2,median,accuracy,0.332
57
+ anli_r2,3,GPT-3 style,acc,0.317
58
+ anli_r2,3,MNLI crowdsource,acc,0.311
59
+ anli_r2,3,can we infer,acc,0.333
60
+ anli_r2,3,guaranteed/possible/impossible,acc,0.335
61
+ anli_r2,3,justified in saying,acc,0.339
62
+ anli_r2,3,median,accuracy,0.333
63
+ anli_r2,4,GPT-3 style,acc,0.313
64
+ anli_r2,4,MNLI crowdsource,acc,0.323
65
+ anli_r2,4,can we infer,acc,0.317
66
+ anli_r2,4,guaranteed/possible/impossible,acc,0.34
67
+ anli_r2,4,justified in saying,acc,0.319
68
+ anli_r2,4,median,accuracy,0.319
69
+ anli_r2,5,GPT-3 style,acc,0.324
70
+ anli_r2,5,MNLI crowdsource,acc,0.338
71
+ anli_r2,5,can we infer,acc,0.327
72
+ anli_r2,5,guaranteed/possible/impossible,acc,0.337
73
+ anli_r2,5,justified in saying,acc,0.315
74
+ anli_r2,5,median,accuracy,0.327
75
+ anli_r2,5,average,multiple,0.32633333333333336
76
+ anli_r3,0,GPT-3 style,acc,0.3383333333333333
77
+ anli_r3,0,MNLI crowdsource,acc,0.33666666666666667
78
+ anli_r3,0,can we infer,acc,0.33916666666666667
79
+ anli_r3,0,guaranteed/possible/impossible,acc,0.2991666666666667
80
+ anli_r3,0,justified in saying,acc,0.3433333333333333
81
+ anli_r3,0,median,accuracy,0.3383333333333333
82
+ anli_r3,1,GPT-3 style,acc,0.3325
83
+ anli_r3,1,MNLI crowdsource,acc,0.3358333333333333
84
+ anli_r3,1,can we infer,acc,0.3408333333333333
85
+ anli_r3,1,guaranteed/possible/impossible,acc,0.33666666666666667
86
+ anli_r3,1,justified in saying,acc,0.33916666666666667
87
+ anli_r3,1,median,accuracy,0.33666666666666667
88
+ anli_r3,2,GPT-3 style,acc,0.32416666666666666
89
+ anli_r3,2,MNLI crowdsource,acc,0.32
90
+ anli_r3,2,can we infer,acc,0.31166666666666665
91
+ anli_r3,2,guaranteed/possible/impossible,acc,0.305
92
+ anli_r3,2,justified in saying,acc,0.30416666666666664
93
+ anli_r3,2,median,accuracy,0.31166666666666665
94
+ anli_r3,3,GPT-3 style,acc,0.3408333333333333
95
+ anli_r3,3,MNLI crowdsource,acc,0.35
96
+ anli_r3,3,can we infer,acc,0.3333333333333333
97
+ anli_r3,3,guaranteed/possible/impossible,acc,0.31916666666666665
98
+ anli_r3,3,justified in saying,acc,0.3441666666666667
99
+ anli_r3,3,median,accuracy,0.3408333333333333
100
+ anli_r3,4,GPT-3 style,acc,0.33166666666666667
101
+ anli_r3,4,MNLI crowdsource,acc,0.3275
102
+ anli_r3,4,can we infer,acc,0.3383333333333333
103
+ anli_r3,4,guaranteed/possible/impossible,acc,0.3375
104
+ anli_r3,4,justified in saying,acc,0.3358333333333333
105
+ anli_r3,4,median,accuracy,0.3358333333333333
106
+ anli_r3,5,GPT-3 style,acc,0.32166666666666666
107
+ anli_r3,5,MNLI crowdsource,acc,0.32
108
+ anli_r3,5,can we infer,acc,0.33666666666666667
109
+ anli_r3,5,guaranteed/possible/impossible,acc,0.32666666666666666
110
+ anli_r3,5,justified in saying,acc,0.32416666666666666
111
+ anli_r3,5,median,accuracy,0.32416666666666666
112
+ anli_r3,5,average,multiple,0.33125
113
+ arc_easy,0,heres_a_problem,acc,0.23890784982935154
114
+ arc_easy,0,i_am_hesitating,acc,0.3042929292929293
115
+ arc_easy,0,multiple_choice,acc,0.25715488215488214
116
+ arc_easy,0,pick_the_most_correct_option,acc,0.22866894197952217
117
+ arc_easy,0,qa_options,acc,0.2525597269624573
118
+ arc_easy,0,median,accuracy,0.2525597269624573
119
+ arc_easy,1,heres_a_problem,acc,0.2398989898989899
120
+ arc_easy,1,i_am_hesitating,acc,0.2627986348122867
121
+ arc_easy,1,multiple_choice,acc,0.2836700336700337
122
+ arc_easy,1,pick_the_most_correct_option,acc,0.23122866894197952
123
+ arc_easy,1,qa_options,acc,0.25426621160409557
124
+ arc_easy,1,median,accuracy,0.25426621160409557
125
+ arc_easy,2,heres_a_problem,acc,0.24494949494949494
126
+ arc_easy,2,i_am_hesitating,acc,0.2946127946127946
127
+ arc_easy,2,multiple_choice,acc,0.23293515358361774
128
+ arc_easy,2,pick_the_most_correct_option,acc,0.2354948805460751
129
+ arc_easy,2,qa_options,acc,0.31523569023569026
130
+ arc_easy,2,median,accuracy,0.24494949494949494
131
+ arc_easy,3,heres_a_problem,acc,0.25336700336700335
132
+ arc_easy,3,i_am_hesitating,acc,0.26791808873720135
133
+ arc_easy,3,multiple_choice,acc,0.2431740614334471
134
+ arc_easy,3,pick_the_most_correct_option,acc,0.24061433447098976
135
+ arc_easy,3,qa_options,acc,0.31734006734006737
136
+ arc_easy,3,median,accuracy,0.25336700336700335
137
+ arc_easy,4,heres_a_problem,acc,0.2380546075085324
138
+ arc_easy,4,i_am_hesitating,acc,0.29713804713804715
139
+ arc_easy,4,multiple_choice,acc,0.2908249158249158
140
+ arc_easy,4,pick_the_most_correct_option,acc,0.2361111111111111
141
+ arc_easy,4,qa_options,acc,0.26791808873720135
142
+ arc_easy,4,median,accuracy,0.26791808873720135
143
+ arc_easy,5,heres_a_problem,acc,0.2226962457337884
144
+ arc_easy,5,i_am_hesitating,acc,0.30303030303030304
145
+ arc_easy,5,multiple_choice,acc,0.2967171717171717
146
+ arc_easy,5,pick_the_most_correct_option,acc,0.24957912457912457
147
+ arc_easy,5,qa_options,acc,0.2619453924914676
148
+ arc_easy,5,median,accuracy,0.2619453924914676
149
+ arc_easy,5,average,multiple,0.2558343196852867
150
+ boolq,0,GPT-3 Style,acc,0.6163333333333333
151
+ boolq,0,after_reading,acc,0.622
152
+ boolq,0,exercise,acc,0.6236666666666667
153
+ boolq,0,valid_binary,acc,0.565
154
+ boolq,0,yes_no_question,acc,0.5426666666666666
155
+ boolq,0,median,accuracy,0.6163333333333333
156
+ boolq,1,GPT-3 Style,acc,0.596
157
+ boolq,1,after_reading,acc,0.546
158
+ boolq,1,exercise,acc,0.5566666666666666
159
+ boolq,1,valid_binary,acc,0.5693333333333334
160
+ boolq,1,yes_no_question,acc,0.5436666666666666
161
+ boolq,1,median,accuracy,0.5566666666666666
162
+ boolq,2,GPT-3 Style,acc,0.5923333333333334
163
+ boolq,2,after_reading,acc,0.5926666666666667
164
+ boolq,2,exercise,acc,0.576
165
+ boolq,2,valid_binary,acc,0.5973333333333334
166
+ boolq,2,yes_no_question,acc,0.562
167
+ boolq,2,median,accuracy,0.5923333333333334
168
+ boolq,3,GPT-3 Style,acc,0.6083333333333333
169
+ boolq,3,after_reading,acc,0.58
170
+ boolq,3,exercise,acc,0.5796666666666667
171
+ boolq,3,valid_binary,acc,0.5966666666666667
172
+ boolq,3,yes_no_question,acc,0.5646666666666667
173
+ boolq,3,median,accuracy,0.58
174
+ boolq,4,GPT-3 Style,acc,0.6136666666666667
175
+ boolq,4,after_reading,acc,0.5633333333333334
176
+ boolq,4,exercise,acc,0.593
177
+ boolq,4,valid_binary,acc,0.5913333333333334
178
+ boolq,4,yes_no_question,acc,0.5516666666666666
179
+ boolq,4,median,accuracy,0.5913333333333334
180
+ boolq,5,GPT-3 Style,acc,0.609
181
+ boolq,5,after_reading,acc,0.5546666666666666
182
+ boolq,5,exercise,acc,0.5896666666666667
183
+ boolq,5,valid_binary,acc,0.583
184
+ boolq,5,yes_no_question,acc,0.5483333333333333
185
+ boolq,5,median,accuracy,0.583
186
+ boolq,5,average,multiple,0.5866111111111111
187
+ cb,0,GPT-3 style,acc,0.4107142857142857
188
+ cb,0,MNLI crowdsource,acc,0.4107142857142857
189
+ cb,0,can we infer,acc,0.2857142857142857
190
+ cb,0,guaranteed/possible/impossible,acc,0.42857142857142855
191
+ cb,0,justified in saying,acc,0.19642857142857142
192
+ cb,0,median,accuracy,0.4107142857142857
193
+ cb,1,GPT-3 style,acc,0.39285714285714285
194
+ cb,1,MNLI crowdsource,acc,0.39285714285714285
195
+ cb,1,can we infer,acc,0.39285714285714285
196
+ cb,1,guaranteed/possible/impossible,acc,0.39285714285714285
197
+ cb,1,justified in saying,acc,0.44642857142857145
198
+ cb,1,median,accuracy,0.39285714285714285
199
+ cb,2,GPT-3 style,acc,0.42857142857142855
200
+ cb,2,MNLI crowdsource,acc,0.44642857142857145
201
+ cb,2,can we infer,acc,0.42857142857142855
202
+ cb,2,guaranteed/possible/impossible,acc,0.44642857142857145
203
+ cb,2,justified in saying,acc,0.42857142857142855
204
+ cb,2,median,accuracy,0.42857142857142855
205
+ cb,3,GPT-3 style,acc,0.39285714285714285
206
+ cb,3,MNLI crowdsource,acc,0.3392857142857143
207
+ cb,3,can we infer,acc,0.44642857142857145
208
+ cb,3,guaranteed/possible/impossible,acc,0.375
209
+ cb,3,justified in saying,acc,0.375
210
+ cb,3,median,accuracy,0.375
211
+ cb,4,GPT-3 style,acc,0.4107142857142857
212
+ cb,4,MNLI crowdsource,acc,0.39285714285714285
213
+ cb,4,can we infer,acc,0.42857142857142855
214
+ cb,4,guaranteed/possible/impossible,acc,0.5357142857142857
215
+ cb,4,justified in saying,acc,0.44642857142857145
216
+ cb,4,median,accuracy,0.42857142857142855
217
+ cb,5,GPT-3 style,acc,0.48214285714285715
218
+ cb,5,MNLI crowdsource,acc,0.4107142857142857
219
+ cb,5,can we infer,acc,0.375
220
+ cb,5,guaranteed/possible/impossible,acc,0.375
221
+ cb,5,justified in saying,acc,0.39285714285714285
222
+ cb,5,median,accuracy,0.39285714285714285
223
+ cb,5,average,multiple,0.40476190476190477
224
+ copa,0,best_option,acc,0.53
225
+ copa,0,cause_effect,acc,0.52
226
+ copa,0,choose,acc,0.49
227
+ copa,0,i_am_hesitating,acc,0.54
228
+ copa,0,plausible_alternatives,acc,0.53
229
+ copa,0,median,accuracy,0.53
230
+ copa,1,best_option,acc,0.59
231
+ copa,1,cause_effect,acc,0.46
232
+ copa,1,choose,acc,0.45
233
+ copa,1,i_am_hesitating,acc,0.45
234
+ copa,1,plausible_alternatives,acc,0.46
235
+ copa,1,median,accuracy,0.46
236
+ copa,2,best_option,acc,0.51
237
+ copa,2,cause_effect,acc,0.45
238
+ copa,2,choose,acc,0.45
239
+ copa,2,i_am_hesitating,acc,0.49
240
+ copa,2,plausible_alternatives,acc,0.46
241
+ copa,2,median,accuracy,0.46
242
+ copa,3,best_option,acc,0.55
243
+ copa,3,cause_effect,acc,0.47
244
+ copa,3,choose,acc,0.49
245
+ copa,3,i_am_hesitating,acc,0.48
246
+ copa,3,plausible_alternatives,acc,0.49
247
+ copa,3,median,accuracy,0.49
248
+ copa,4,best_option,acc,0.49
249
+ copa,4,cause_effect,acc,0.48
250
+ copa,4,choose,acc,0.51
251
+ copa,4,i_am_hesitating,acc,0.51
252
+ copa,4,plausible_alternatives,acc,0.48
253
+ copa,4,median,accuracy,0.49
254
+ copa,5,best_option,acc,0.54
255
+ copa,5,cause_effect,acc,0.51
256
+ copa,5,choose,acc,0.46
257
+ copa,5,i_am_hesitating,acc,0.51
258
+ copa,5,plausible_alternatives,acc,0.5
259
+ copa,5,median,accuracy,0.51
260
+ copa,5,average,multiple,0.49
261
+ e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.15645061177192066
262
+ e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.06347842363431547
263
+ e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.00012067093428409366
264
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00024104025657346095
265
+ e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.10910465326076894
266
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.06347842363431547
267
+ e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1870937559813721
268
+ e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16511209673657395
269
+ e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.025195913355673966
270
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1714205638298909
271
+ e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.20219167803744306
272
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1714205638298909
273
+ e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.18600518275150685
274
+ e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.17074360575215342
275
+ e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.04447784117945149
276
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19259169221915515
277
+ e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.19722529213201134
278
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18600518275150685
279
+ e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.18307097946148873
280
+ e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.17213478001357976
281
+ e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.038284747118588126
282
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19636018570824587
283
+ e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.1964954395976402
284
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18307097946148873
285
+ e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.19134136835621748
286
+ e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.17010384910521295
287
+ e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.037516989850184534
288
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.19590832872090894
289
+ e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19536984000862256
290
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.19134136835621748
291
+ e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.18872128486346074
292
+ e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.1683711858028947
293
+ e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.038242180726931196
294
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19402158147865167
295
+ e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.19119099944111612
296
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.18872128486346074
297
+ e2e_nlg_cleaned,5,average,multiple,0.16400630048281337
298
+ gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.014155568509608755
299
+ gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.005848067139995684
300
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.01730052045113504
301
+ gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.031013676801335422
302
+ gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.040900489822348056
303
+ gem_xsum,0,median,rouge2_fmeasure,0.01730052045113504
304
+ gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.020262527556005907
305
+ gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.012072025290438592
306
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.019132118327200527
307
+ gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04334620232538617
308
+ gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.038774277981477374
309
+ gem_xsum,1,median,rouge2_fmeasure,0.020262527556005907
310
+ gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.02824595859604695
311
+ gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.02751335673945438
312
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.026545543337132424
313
+ gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.04362070001507444
314
+ gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.03664914264570665
315
+ gem_xsum,2,median,rouge2_fmeasure,0.02824595859604695
316
+ gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.02800561543388405
317
+ gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.0402095932041227
318
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03291830334125208
319
+ gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.0400453211123096
320
+ gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.03701973106444136
321
+ gem_xsum,3,median,rouge2_fmeasure,0.03701973106444136
322
+ gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.00666835063292078
323
+ gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.010845224152235416
324
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010104068388385765
325
+ gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.010522073701869125
326
+ gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.008786196844590121
327
+ gem_xsum,4,median,rouge2_fmeasure,0.010104068388385765
328
+ gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
329
+ gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.0003107051777238192
330
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00041371259854665804
331
+ gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.00046275158053195667
332
+ gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,6.352836541515787e-05
333
+ gem_xsum,5,median,rouge2_fmeasure,0.0003107051777238192
334
+ gem_xsum,5,average,multiple,0.018873918538956473
335
+ piqa,0,Correct the solution,rouge2_fmeasure,0.09706102035374112
336
+ piqa,0,choose the most appropriate solution,acc,0.49510337323177367
337
+ piqa,0,no prompt needed,rouge2_fmeasure,0.005928136888518339
338
+ piqa,0,pick_correct_choice_index,acc,0.49510337323177367
339
+ piqa,0,what_is_the_correct_ending,acc,0.5565832426550599
340
+ piqa,0,median,accuracy,0.49510337323177367
341
+ piqa,1,Correct the solution,rouge2_fmeasure,0.16839814753926893
342
+ piqa,1,choose the most appropriate solution,acc,0.5087051142546246
343
+ piqa,1,no prompt needed,rouge2_fmeasure,0.005682715949708656
344
+ piqa,1,pick_correct_choice_index,acc,0.5076169749727966
345
+ piqa,1,what_is_the_correct_ending,acc,0.5685527747551686
346
+ piqa,1,median,accuracy,0.5087051142546246
347
+ piqa,2,Correct the solution,rouge2_fmeasure,0.21700191007059494
348
+ piqa,2,choose the most appropriate solution,acc,0.5223068552774756
349
+ piqa,2,no prompt needed,rouge2_fmeasure,0.005621916396892083
350
+ piqa,2,pick_correct_choice_index,acc,0.5
351
+ piqa,2,what_is_the_correct_ending,acc,0.5718171926006529
352
+ piqa,2,median,accuracy,0.5223068552774756
353
+ piqa,3,Correct the solution,rouge2_fmeasure,0.2220313726729203
354
+ piqa,3,choose the most appropriate solution,acc,0.5092491838955386
355
+ piqa,3,no prompt needed,rouge2_fmeasure,0.005486989401606149
356
+ piqa,3,pick_correct_choice_index,acc,0.515778019586507
357
+ piqa,3,what_is_the_correct_ending,acc,0.5663764961915125
358
+ piqa,3,median,accuracy,0.515778019586507
359
+ piqa,4,Correct the solution,rouge2_fmeasure,0.21583669822052345
360
+ piqa,4,choose the most appropriate solution,acc,0.5282916213275299
361
+ piqa,4,no prompt needed,rouge2_fmeasure,0.005250361302057742
362
+ piqa,4,pick_correct_choice_index,acc,0.5228509249183896
363
+ piqa,4,what_is_the_correct_ending,acc,0.5865070729053319
364
+ piqa,4,median,accuracy,0.5282916213275299
365
+ piqa,5,Correct the solution,rouge2_fmeasure,0.20868674330105244
366
+ piqa,5,choose the most appropriate solution,acc,0.5114254624591947
367
+ piqa,5,no prompt needed,rouge2_fmeasure,0.005515135528910162
368
+ piqa,5,pick_correct_choice_index,acc,0.5021762785636561
369
+ piqa,5,what_is_the_correct_ending,acc,0.5848748639825898
370
+ piqa,5,median,accuracy,0.5114254624591947
371
+ piqa,5,average,multiple,0.5136017410228509
372
+ sciq,0,Direct Question,acc,0.83
373
+ sciq,0,Direct Question (Closed Book),acc,0.613
374
+ sciq,0,Multiple Choice,acc,0.342
375
+ sciq,0,Multiple Choice (Closed Book),acc,0.287
376
+ sciq,0,Multiple Choice Question First,acc,0.349
377
+ sciq,0,median,accuracy,0.349
378
+ sciq,1,Direct Question,acc,0.846
379
+ sciq,1,Direct Question (Closed Book),acc,0.663
380
+ sciq,1,Multiple Choice,acc,0.378
381
+ sciq,1,Multiple Choice (Closed Book),acc,0.378
382
+ sciq,1,Multiple Choice Question First,acc,0.392
383
+ sciq,1,median,accuracy,0.392
384
+ sciq,2,Direct Question,acc,0.853
385
+ sciq,2,Direct Question (Closed Book),acc,0.673
386
+ sciq,2,Multiple Choice,acc,0.344
387
+ sciq,2,Multiple Choice (Closed Book),acc,0.372
388
+ sciq,2,Multiple Choice Question First,acc,0.363
389
+ sciq,2,median,accuracy,0.372
390
+ sciq,3,Direct Question,acc,0.856
391
+ sciq,3,Direct Question (Closed Book),acc,0.662
392
+ sciq,3,Multiple Choice,acc,0.329
393
+ sciq,3,Multiple Choice (Closed Book),acc,0.349
394
+ sciq,3,Multiple Choice Question First,acc,0.363
395
+ sciq,3,median,accuracy,0.363
396
+ sciq,4,Direct Question,acc,0.849
397
+ sciq,4,Direct Question (Closed Book),acc,0.671
398
+ sciq,4,Multiple Choice,acc,0.335
399
+ sciq,4,Multiple Choice (Closed Book),acc,0.335
400
+ sciq,4,Multiple Choice Question First,acc,0.319
401
+ sciq,4,median,accuracy,0.335
402
+ sciq,5,Direct Question,acc,0.849
403
+ sciq,5,Direct Question (Closed Book),acc,0.682
404
+ sciq,5,Multiple Choice,acc,0.327
405
+ sciq,5,Multiple Choice (Closed Book),acc,0.362
406
+ sciq,5,Multiple Choice Question First,acc,0.333
407
+ sciq,5,median,accuracy,0.362
408
+ sciq,5,average,multiple,0.36216666666666664
409
+ story_cloze_2016,0,Answer Given options,acc,0.4719401389631213
410
+ story_cloze_2016,0,Choose Story Ending,acc,0.4906467129877071
411
+ story_cloze_2016,0,Novel Correct Ending,acc,0.4831640833778728
412
+ story_cloze_2016,0,Story Continuation and Options,acc,0.49706039551042225
413
+ story_cloze_2016,0,median,accuracy,0.48690539818279
414
+ story_cloze_2016,1,Answer Given options,acc,0.4521646178514164
415
+ story_cloze_2016,1,Choose Story Ending,acc,0.4596472474612507
416
+ story_cloze_2016,1,Novel Correct Ending,acc,0.4494922501336184
417
+ story_cloze_2016,1,Story Continuation and Options,acc,0.46392303580972744
418
+ story_cloze_2016,1,median,accuracy,0.4559059326563335
419
+ story_cloze_2016,2,Answer Given options,acc,0.4510956707642972
420
+ story_cloze_2016,2,Choose Story Ending,acc,0.4623196151790486
421
+ story_cloze_2016,2,Novel Correct Ending,acc,0.4478888295029396
422
+ story_cloze_2016,2,Story Continuation and Options,acc,0.45911277391769106
423
+ story_cloze_2016,2,median,accuracy,0.4551042223409941
424
+ story_cloze_2016,3,Answer Given options,acc,0.4665954035275254
425
+ story_cloze_2016,3,Choose Story Ending,acc,0.45269909139497594
426
+ story_cloze_2016,3,Novel Correct Ending,acc,0.4494922501336184
427
+ story_cloze_2016,3,Story Continuation and Options,acc,0.4521646178514164
428
+ story_cloze_2016,3,median,accuracy,0.45243185462319613
429
+ story_cloze_2016,4,Answer Given options,acc,0.45537145911277394
430
+ story_cloze_2016,4,Choose Story Ending,acc,0.46125066809192944
431
+ story_cloze_2016,4,Novel Correct Ending,acc,0.44200962052378406
432
+ story_cloze_2016,4,Story Continuation and Options,acc,0.4510956707642972
433
+ story_cloze_2016,4,median,accuracy,0.45323356493853556
434
+ story_cloze_2016,5,Answer Given options,acc,0.4665954035275254
435
+ story_cloze_2016,5,Choose Story Ending,acc,0.467129877071085
436
+ story_cloze_2016,5,Novel Correct Ending,acc,0.45056119722073756
437
+ story_cloze_2016,5,Story Continuation and Options,acc,0.4665954035275254
438
+ story_cloze_2016,5,median,accuracy,0.4665954035275254
439
+ story_cloze_2016,5,average,multiple,0.46169606271156244
440
+ superglue_rte,0,GPT-3 style,acc,0.5270758122743683
441
+ superglue_rte,0,MNLI crowdsource,acc,0.5342960288808665
442
+ superglue_rte,0,does it follow that,acc,0.5270758122743683
443
+ superglue_rte,0,guaranteed true,acc,0.5054151624548736
444
+ superglue_rte,0,should assume,acc,0.5415162454873647
445
+ superglue_rte,0,median,accuracy,0.5270758122743683
446
+ superglue_rte,1,GPT-3 style,acc,0.4729241877256318
447
+ superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
448
+ superglue_rte,1,does it follow that,acc,0.49097472924187724
449
+ superglue_rte,1,guaranteed true,acc,0.49097472924187724
450
+ superglue_rte,1,should assume,acc,0.49097472924187724
451
+ superglue_rte,1,median,accuracy,0.49097472924187724
452
+ superglue_rte,2,GPT-3 style,acc,0.51985559566787
453
+ superglue_rte,2,MNLI crowdsource,acc,0.51985559566787
454
+ superglue_rte,2,does it follow that,acc,0.5090252707581228
455
+ superglue_rte,2,guaranteed true,acc,0.5270758122743683
456
+ superglue_rte,2,should assume,acc,0.5090252707581228
457
+ superglue_rte,2,median,accuracy,0.51985559566787
458
+ superglue_rte,3,GPT-3 style,acc,0.5090252707581228
459
+ superglue_rte,3,MNLI crowdsource,acc,0.49097472924187724
460
+ superglue_rte,3,does it follow that,acc,0.48375451263537905
461
+ superglue_rte,3,guaranteed true,acc,0.516245487364621
462
+ superglue_rte,3,should assume,acc,0.5018050541516246
463
+ superglue_rte,3,median,accuracy,0.5018050541516246
464
+ superglue_rte,4,GPT-3 style,acc,0.4620938628158845
465
+ superglue_rte,4,MNLI crowdsource,acc,0.48736462093862815
466
+ superglue_rte,4,does it follow that,acc,0.48014440433212996
467
+ superglue_rte,4,guaranteed true,acc,0.5090252707581228
468
+ superglue_rte,4,should assume,acc,0.48014440433212996
469
+ superglue_rte,4,median,accuracy,0.48014440433212996
470
+ superglue_rte,5,GPT-3 style,acc,0.4548736462093863
471
+ superglue_rte,5,MNLI crowdsource,acc,0.4693140794223827
472
+ superglue_rte,5,does it follow that,acc,0.4981949458483754
473
+ superglue_rte,5,guaranteed true,acc,0.4693140794223827
474
+ superglue_rte,5,should assume,acc,0.4729241877256318
475
+ superglue_rte,5,median,accuracy,0.4693140794223827
476
+ superglue_rte,5,average,multiple,0.4981949458483754
477
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0532813862747049
478
+ web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.012985384177633208
479
+ web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.0019179536475281184
480
+ web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.004150191718708099
481
+ web_nlg_en,0,very-explicit-description,rouge2_fmeasure,6.345797512857661e-05
482
+ web_nlg_en,0,median,rouge2_fmeasure,0.004150191718708099
483
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05368591058094131
484
+ web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.04640292562526275
485
+ web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.025426464984268635
486
+ web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.054485621293343514
487
+ web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.051815361827752766
488
+ web_nlg_en,1,median,rouge2_fmeasure,0.051815361827752766
489
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05344291957030947
490
+ web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.07757017845101091
491
+ web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.032591510976486694
492
+ web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.061645677874947354
493
+ web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.0538074768484528
494
+ web_nlg_en,2,median,rouge2_fmeasure,0.0538074768484528
495
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05368996382308088
496
+ web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.06808437331115559
497
+ web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.036709719893509046
498
+ web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.06571024213935271
499
+ web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.0579898457998029
500
+ web_nlg_en,3,median,rouge2_fmeasure,0.0579898457998029
501
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0515680827205002
502
+ web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.052457179235399276
503
+ web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.036402813498665906
504
+ web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.06374220282296517
505
+ web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.05793797811823835
506
+ web_nlg_en,4,median,rouge2_fmeasure,0.052457179235399276
507
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05107734688924233
508
+ web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.045075512409701604
509
+ web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.03482971179628577
510
+ web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.0639493149144395
511
+ web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.05600461944766409
512
+ web_nlg_en,5,median,rouge2_fmeasure,0.05107734688924233
513
+ web_nlg_en,5,average,multiple,0.04521623371989303
514
+ wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.009594517812957653
515
+ wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.003243321779952968
516
+ wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.0042667329498244436
517
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.002874313185982406
518
+ wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.0011035986294212138
519
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.003243321779952968
520
+ wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.017846850141455827
521
+ wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.010181112623842817
522
+ wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.00423567615381497
523
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.028190707681194575
524
+ wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.010646298254836605
525
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.010646298254836605
526
+ wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.022535640055881916
527
+ wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.02117387153026309
528
+ wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.004697153886380661
529
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04456119604899187
530
+ wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.02129586388647884
531
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.02129586388647884
532
+ wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.021648290209856712
533
+ wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.022197025590925616
534
+ wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.0040651974203171634
535
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03887583188926559
536
+ wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.021617853652187335
537
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.021648290209856712
538
+ wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.007536808369783641
539
+ wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.008171833643724272
540
+ wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.0018254913452193152
541
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013407675922368708
542
+ wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.00641884861944969
543
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.007536808369783641
544
+ wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.0011309269927620334
545
+ wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0012168332924537228
546
+ wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.00026468573039586365
547
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020845828252393957
548
+ wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.0006554801175224404
549
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0011309269927620334
550
+ wiki_lingua_en,5,average,multiple,0.010916918248945133
551
+ winogrande,0,Replace,acc,0.5059194948697711
552
+ winogrande,0,True or False,acc,0.494869771112865
553
+ winogrande,0,does underscore refer to,acc,0.4964483030781373
554
+ winogrande,0,stand for,acc,0.5098658247829518
555
+ winogrande,0,underscore refer to,acc,0.5177584846093133
556
+ winogrande,0,median,accuracy,0.5059194948697711
557
+ winogrande,1,Replace,acc,0.5114443567482242
558
+ winogrande,1,True or False,acc,0.494869771112865
559
+ winogrande,1,does underscore refer to,acc,0.49329123914759276
560
+ winogrande,1,stand for,acc,0.5090765588003157
561
+ winogrande,1,underscore refer to,acc,0.4964483030781373
562
+ winogrande,1,median,accuracy,0.4964483030781373
563
+ winogrande,2,Replace,acc,0.5043409629044988
564
+ winogrande,2,True or False,acc,0.49329123914759276
565
+ winogrande,2,does underscore refer to,acc,0.49171270718232046
566
+ winogrande,2,stand for,acc,0.49329123914759276
567
+ winogrande,2,underscore refer to,acc,0.5019731649565904
568
+ winogrande,2,median,accuracy,0.49329123914759276
569
+ winogrande,3,Replace,acc,0.5059194948697711
570
+ winogrande,3,True or False,acc,0.4988161010260458
571
+ winogrande,3,does underscore refer to,acc,0.48855564325177586
572
+ winogrande,3,stand for,acc,0.4980268350434096
573
+ winogrande,3,underscore refer to,acc,0.5209155485398579
574
+ winogrande,3,median,accuracy,0.4988161010260458
575
+ winogrande,4,Replace,acc,0.5019731649565904
576
+ winogrande,4,True or False,acc,0.5098658247829518
577
+ winogrande,4,does underscore refer to,acc,0.4877663772691397
578
+ winogrande,4,stand for,acc,0.4980268350434096
579
+ winogrande,4,underscore refer to,acc,0.5193370165745856
580
+ winogrande,4,median,accuracy,0.5019731649565904
581
+ winogrande,5,Replace,acc,0.4956590370955012
582
+ winogrande,5,True or False,acc,0.5019731649565904
583
+ winogrande,5,does underscore refer to,acc,0.4925019731649566
584
+ winogrande,5,stand for,acc,0.489344909234412
585
+ winogrande,5,underscore refer to,acc,0.5090765588003157
586
+ winogrande,5,median,accuracy,0.4956590370955012
587
+ winogrande,5,average,multiple,0.4986845566956064
4b284b12bc4/eval/merged.json ADDED
The diff for this file is too large to render. See raw diff
 
4b284b17bc4/eval/merged.csv ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ anli_r1,0,GPT-3 style,acc,0.334
3
+ anli_r1,0,MNLI crowdsource,acc,0.337
4
+ anli_r1,0,can we infer,acc,0.335
5
+ anli_r1,0,guaranteed/possible/impossible,acc,0.349
6
+ anli_r1,0,justified in saying,acc,0.339
7
+ anli_r1,0,median,accuracy,0.337
8
+ anli_r1,1,GPT-3 style,acc,0.324
9
+ anli_r1,1,MNLI crowdsource,acc,0.332
10
+ anli_r1,1,can we infer,acc,0.333
11
+ anli_r1,1,guaranteed/possible/impossible,acc,0.333
12
+ anli_r1,1,justified in saying,acc,0.332
13
+ anli_r1,1,median,accuracy,0.332
14
+ anli_r1,2,GPT-3 style,acc,0.352
15
+ anli_r1,2,MNLI crowdsource,acc,0.337
16
+ anli_r1,2,can we infer,acc,0.345
17
+ anli_r1,2,guaranteed/possible/impossible,acc,0.33
18
+ anli_r1,2,justified in saying,acc,0.356
19
+ anli_r1,2,median,accuracy,0.345
20
+ anli_r1,3,GPT-3 style,acc,0.343
21
+ anli_r1,3,MNLI crowdsource,acc,0.352
22
+ anli_r1,3,can we infer,acc,0.359
23
+ anli_r1,3,guaranteed/possible/impossible,acc,0.325
24
+ anli_r1,3,justified in saying,acc,0.35
25
+ anli_r1,3,median,accuracy,0.35
26
+ anli_r1,4,GPT-3 style,acc,0.338
27
+ anli_r1,4,MNLI crowdsource,acc,0.341
28
+ anli_r1,4,can we infer,acc,0.337
29
+ anli_r1,4,guaranteed/possible/impossible,acc,0.331
30
+ anli_r1,4,justified in saying,acc,0.328
31
+ anli_r1,4,median,accuracy,0.337
32
+ anli_r1,5,GPT-3 style,acc,0.348
33
+ anli_r1,5,MNLI crowdsource,acc,0.356
34
+ anli_r1,5,can we infer,acc,0.337
35
+ anli_r1,5,guaranteed/possible/impossible,acc,0.333
36
+ anli_r1,5,justified in saying,acc,0.327
37
+ anli_r1,5,median,accuracy,0.337
38
+ anli_r1,5,average,multiple,0.3396666666666667
39
+ anli_r2,0,GPT-3 style,acc,0.333
40
+ anli_r2,0,MNLI crowdsource,acc,0.325
41
+ anli_r2,0,can we infer,acc,0.332
42
+ anli_r2,0,guaranteed/possible/impossible,acc,0.311
43
+ anli_r2,0,justified in saying,acc,0.333
44
+ anli_r2,0,median,accuracy,0.332
45
+ anli_r2,1,GPT-3 style,acc,0.314
46
+ anli_r2,1,MNLI crowdsource,acc,0.319
47
+ anli_r2,1,can we infer,acc,0.315
48
+ anli_r2,1,guaranteed/possible/impossible,acc,0.315
49
+ anli_r2,1,justified in saying,acc,0.32
50
+ anli_r2,1,median,accuracy,0.315
51
+ anli_r2,2,GPT-3 style,acc,0.334
52
+ anli_r2,2,MNLI crowdsource,acc,0.339
53
+ anli_r2,2,can we infer,acc,0.323
54
+ anli_r2,2,guaranteed/possible/impossible,acc,0.335
55
+ anli_r2,2,justified in saying,acc,0.322
56
+ anli_r2,2,median,accuracy,0.334
57
+ anli_r2,3,GPT-3 style,acc,0.325
58
+ anli_r2,3,MNLI crowdsource,acc,0.314
59
+ anli_r2,3,can we infer,acc,0.321
60
+ anli_r2,3,guaranteed/possible/impossible,acc,0.335
61
+ anli_r2,3,justified in saying,acc,0.322
62
+ anli_r2,3,median,accuracy,0.322
63
+ anli_r2,4,GPT-3 style,acc,0.311
64
+ anli_r2,4,MNLI crowdsource,acc,0.303
65
+ anli_r2,4,can we infer,acc,0.332
66
+ anli_r2,4,guaranteed/possible/impossible,acc,0.333
67
+ anli_r2,4,justified in saying,acc,0.331
68
+ anli_r2,4,median,accuracy,0.331
69
+ anli_r2,5,GPT-3 style,acc,0.313
70
+ anli_r2,5,MNLI crowdsource,acc,0.305
71
+ anli_r2,5,can we infer,acc,0.326
72
+ anli_r2,5,guaranteed/possible/impossible,acc,0.333
73
+ anli_r2,5,justified in saying,acc,0.319
74
+ anli_r2,5,median,accuracy,0.319
75
+ anli_r2,5,average,multiple,0.3255
76
+ anli_r3,0,GPT-3 style,acc,0.33416666666666667
77
+ anli_r3,0,MNLI crowdsource,acc,0.33416666666666667
78
+ anli_r3,0,can we infer,acc,0.32666666666666666
79
+ anli_r3,0,guaranteed/possible/impossible,acc,0.31583333333333335
80
+ anli_r3,0,justified in saying,acc,0.3358333333333333
81
+ anli_r3,0,median,accuracy,0.33416666666666667
82
+ anli_r3,1,GPT-3 style,acc,0.3275
83
+ anli_r3,1,MNLI crowdsource,acc,0.3333333333333333
84
+ anli_r3,1,can we infer,acc,0.33666666666666667
85
+ anli_r3,1,guaranteed/possible/impossible,acc,0.33666666666666667
86
+ anli_r3,1,justified in saying,acc,0.33416666666666667
87
+ anli_r3,1,median,accuracy,0.33416666666666667
88
+ anli_r3,2,GPT-3 style,acc,0.33
89
+ anli_r3,2,MNLI crowdsource,acc,0.31583333333333335
90
+ anli_r3,2,can we infer,acc,0.31166666666666665
91
+ anli_r3,2,guaranteed/possible/impossible,acc,0.33166666666666667
92
+ anli_r3,2,justified in saying,acc,0.32166666666666666
93
+ anli_r3,2,median,accuracy,0.32166666666666666
94
+ anli_r3,3,GPT-3 style,acc,0.335
95
+ anli_r3,3,MNLI crowdsource,acc,0.3333333333333333
96
+ anli_r3,3,can we infer,acc,0.3333333333333333
97
+ anli_r3,3,guaranteed/possible/impossible,acc,0.32916666666666666
98
+ anli_r3,3,justified in saying,acc,0.3383333333333333
99
+ anli_r3,3,median,accuracy,0.3333333333333333
100
+ anli_r3,4,GPT-3 style,acc,0.31666666666666665
101
+ anli_r3,4,MNLI crowdsource,acc,0.31666666666666665
102
+ anli_r3,4,can we infer,acc,0.31916666666666665
103
+ anli_r3,4,guaranteed/possible/impossible,acc,0.3425
104
+ anli_r3,4,justified in saying,acc,0.3275
105
+ anli_r3,4,median,accuracy,0.31916666666666665
106
+ anli_r3,5,GPT-3 style,acc,0.3308333333333333
107
+ anli_r3,5,MNLI crowdsource,acc,0.315
108
+ anli_r3,5,can we infer,acc,0.3225
109
+ anli_r3,5,guaranteed/possible/impossible,acc,0.33416666666666667
110
+ anli_r3,5,justified in saying,acc,0.31833333333333336
111
+ anli_r3,5,median,accuracy,0.3225
112
+ anli_r3,5,average,multiple,0.3275
113
+ arc_easy,0,heres_a_problem,acc,0.23274410774410775
114
+ arc_easy,0,i_am_hesitating,acc,0.26706484641638223
115
+ arc_easy,0,multiple_choice,acc,0.2958754208754209
116
+ arc_easy,0,pick_the_most_correct_option,acc,0.2295221843003413
117
+ arc_easy,0,qa_options,acc,0.35269360269360267
118
+ arc_easy,0,median,accuracy,0.26706484641638223
119
+ arc_easy,1,heres_a_problem,acc,0.23208191126279865
120
+ arc_easy,1,i_am_hesitating,acc,0.2790102389078498
121
+ arc_easy,1,multiple_choice,acc,0.30303030303030304
122
+ arc_easy,1,pick_the_most_correct_option,acc,0.22440273037542663
123
+ arc_easy,1,qa_options,acc,0.26621160409556316
124
+ arc_easy,1,median,accuracy,0.26621160409556316
125
+ arc_easy,2,heres_a_problem,acc,0.22013651877133106
126
+ arc_easy,2,i_am_hesitating,acc,0.33207070707070707
127
+ arc_easy,2,multiple_choice,acc,0.2431740614334471
128
+ arc_easy,2,pick_the_most_correct_option,acc,0.21928327645051193
129
+ arc_easy,2,qa_options,acc,0.3409090909090909
130
+ arc_easy,2,median,accuracy,0.2431740614334471
131
+ arc_easy,3,heres_a_problem,acc,0.24368686868686867
132
+ arc_easy,3,i_am_hesitating,acc,0.2508532423208191
133
+ arc_easy,3,multiple_choice,acc,0.3202861952861953
134
+ arc_easy,3,pick_the_most_correct_option,acc,0.24494949494949494
135
+ arc_easy,3,qa_options,acc,0.26023890784982934
136
+ arc_easy,3,median,accuracy,0.2508532423208191
137
+ arc_easy,4,heres_a_problem,acc,0.23863636363636365
138
+ arc_easy,4,i_am_hesitating,acc,0.3400673400673401
139
+ arc_easy,4,multiple_choice,acc,0.30134680134680136
140
+ arc_easy,4,pick_the_most_correct_option,acc,0.2354948805460751
141
+ arc_easy,4,qa_options,acc,0.3287037037037037
142
+ arc_easy,4,median,accuracy,0.30134680134680136
143
+ arc_easy,5,heres_a_problem,acc,0.23208191126279865
144
+ arc_easy,5,i_am_hesitating,acc,0.335016835016835
145
+ arc_easy,5,multiple_choice,acc,0.24146757679180889
146
+ arc_easy,5,pick_the_most_correct_option,acc,0.23378839590443687
147
+ arc_easy,5,qa_options,acc,0.25170648464163825
148
+ arc_easy,5,median,accuracy,0.24146757679180889
149
+ arc_easy,5,average,multiple,0.26168635540080365
150
+ boolq,0,GPT-3 Style,acc,0.5496666666666666
151
+ boolq,0,after_reading,acc,0.6233333333333333
152
+ boolq,0,exercise,acc,0.6236666666666667
153
+ boolq,0,valid_binary,acc,0.611
154
+ boolq,0,yes_no_question,acc,0.606
155
+ boolq,0,median,accuracy,0.611
156
+ boolq,1,GPT-3 Style,acc,0.56
157
+ boolq,1,after_reading,acc,0.5856666666666667
158
+ boolq,1,exercise,acc,0.5576666666666666
159
+ boolq,1,valid_binary,acc,0.6203333333333333
160
+ boolq,1,yes_no_question,acc,0.5746666666666667
161
+ boolq,1,median,accuracy,0.5746666666666667
162
+ boolq,2,GPT-3 Style,acc,0.58
163
+ boolq,2,after_reading,acc,0.6053333333333333
164
+ boolq,2,exercise,acc,0.5663333333333334
165
+ boolq,2,valid_binary,acc,0.623
166
+ boolq,2,yes_no_question,acc,0.5926666666666667
167
+ boolq,2,median,accuracy,0.5926666666666667
168
+ boolq,3,GPT-3 Style,acc,0.5823333333333334
169
+ boolq,3,after_reading,acc,0.6026666666666667
170
+ boolq,3,exercise,acc,0.5706666666666667
171
+ boolq,3,valid_binary,acc,0.6233333333333333
172
+ boolq,3,yes_no_question,acc,0.595
173
+ boolq,3,median,accuracy,0.595
174
+ boolq,4,GPT-3 Style,acc,0.587
175
+ boolq,4,after_reading,acc,0.6043333333333333
176
+ boolq,4,exercise,acc,0.5726666666666667
177
+ boolq,4,valid_binary,acc,0.621
178
+ boolq,4,yes_no_question,acc,0.577
179
+ boolq,4,median,accuracy,0.587
180
+ boolq,5,GPT-3 Style,acc,0.5886666666666667
181
+ boolq,5,after_reading,acc,0.604
182
+ boolq,5,exercise,acc,0.5673333333333334
183
+ boolq,5,valid_binary,acc,0.6223333333333333
184
+ boolq,5,yes_no_question,acc,0.5723333333333334
185
+ boolq,5,median,accuracy,0.5886666666666667
186
+ boolq,5,average,multiple,0.5915
187
+ cb,0,GPT-3 style,acc,0.4107142857142857
188
+ cb,0,MNLI crowdsource,acc,0.42857142857142855
189
+ cb,0,can we infer,acc,0.375
190
+ cb,0,guaranteed/possible/impossible,acc,0.42857142857142855
191
+ cb,0,justified in saying,acc,0.42857142857142855
192
+ cb,0,median,accuracy,0.42857142857142855
193
+ cb,1,GPT-3 style,acc,0.3392857142857143
194
+ cb,1,MNLI crowdsource,acc,0.39285714285714285
195
+ cb,1,can we infer,acc,0.39285714285714285
196
+ cb,1,guaranteed/possible/impossible,acc,0.375
197
+ cb,1,justified in saying,acc,0.375
198
+ cb,1,median,accuracy,0.375
199
+ cb,2,GPT-3 style,acc,0.4107142857142857
200
+ cb,2,MNLI crowdsource,acc,0.375
201
+ cb,2,can we infer,acc,0.375
202
+ cb,2,guaranteed/possible/impossible,acc,0.16071428571428573
203
+ cb,2,justified in saying,acc,0.44642857142857145
204
+ cb,2,median,accuracy,0.375
205
+ cb,3,GPT-3 style,acc,0.4107142857142857
206
+ cb,3,MNLI crowdsource,acc,0.3392857142857143
207
+ cb,3,can we infer,acc,0.4642857142857143
208
+ cb,3,guaranteed/possible/impossible,acc,0.16071428571428573
209
+ cb,3,justified in saying,acc,0.4107142857142857
210
+ cb,3,median,accuracy,0.4107142857142857
211
+ cb,4,GPT-3 style,acc,0.35714285714285715
212
+ cb,4,MNLI crowdsource,acc,0.35714285714285715
213
+ cb,4,can we infer,acc,0.4642857142857143
214
+ cb,4,guaranteed/possible/impossible,acc,0.14285714285714285
215
+ cb,4,justified in saying,acc,0.48214285714285715
216
+ cb,4,median,accuracy,0.35714285714285715
217
+ cb,5,GPT-3 style,acc,0.35714285714285715
218
+ cb,5,MNLI crowdsource,acc,0.4107142857142857
219
+ cb,5,can we infer,acc,0.5
220
+ cb,5,guaranteed/possible/impossible,acc,0.16071428571428573
221
+ cb,5,justified in saying,acc,0.4107142857142857
222
+ cb,5,median,accuracy,0.4107142857142857
223
+ cb,5,average,multiple,0.39285714285714285
224
+ copa,0,best_option,acc,0.55
225
+ copa,0,cause_effect,acc,0.54
226
+ copa,0,choose,acc,0.57
227
+ copa,0,i_am_hesitating,acc,0.54
228
+ copa,0,plausible_alternatives,acc,0.57
229
+ copa,0,median,accuracy,0.55
230
+ copa,1,best_option,acc,0.48
231
+ copa,1,cause_effect,acc,0.44
232
+ copa,1,choose,acc,0.46
233
+ copa,1,i_am_hesitating,acc,0.46
234
+ copa,1,plausible_alternatives,acc,0.41
235
+ copa,1,median,accuracy,0.46
236
+ copa,2,best_option,acc,0.42
237
+ copa,2,cause_effect,acc,0.41
238
+ copa,2,choose,acc,0.4
239
+ copa,2,i_am_hesitating,acc,0.4
240
+ copa,2,plausible_alternatives,acc,0.39
241
+ copa,2,median,accuracy,0.4
242
+ copa,3,best_option,acc,0.46
243
+ copa,3,cause_effect,acc,0.42
244
+ copa,3,choose,acc,0.4
245
+ copa,3,i_am_hesitating,acc,0.42
246
+ copa,3,plausible_alternatives,acc,0.43
247
+ copa,3,median,accuracy,0.42
248
+ copa,4,best_option,acc,0.47
249
+ copa,4,cause_effect,acc,0.39
250
+ copa,4,choose,acc,0.46
251
+ copa,4,i_am_hesitating,acc,0.41
252
+ copa,4,plausible_alternatives,acc,0.43
253
+ copa,4,median,accuracy,0.43
254
+ copa,5,best_option,acc,0.46
255
+ copa,5,cause_effect,acc,0.43
256
+ copa,5,choose,acc,0.45
257
+ copa,5,i_am_hesitating,acc,0.41
258
+ copa,5,plausible_alternatives,acc,0.45
259
+ copa,5,median,accuracy,0.45
260
+ copa,5,average,multiple,0.45166666666666666
261
+ e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.09892905722529392
262
+ e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.02500994962430241
263
+ e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.0
264
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.004707141554710639
265
+ e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.1073793884770636
266
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.02500994962430241
267
+ e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1571930586851638
268
+ e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16340881202208163
269
+ e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.030860375813935463
270
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1486663277769484
271
+ e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.19855266031915028
272
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1571930586851638
273
+ e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.16543669174208636
274
+ e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.17467867245016275
275
+ e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.062337691922640125
276
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1507673483604289
277
+ e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.20344292743727435
278
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16543669174208636
279
+ e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.1703052547809578
280
+ e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.17648552604551038
281
+ e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.08500284986690841
282
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.158839720125521
283
+ e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.20463135769763866
284
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.1703052547809578
285
+ e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.1738253944796353
286
+ e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.17525069265082474
287
+ e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.1009343907225879
288
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.16013598883167798
289
+ e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19812463968549573
290
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1738253944796353
291
+ e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.17329021394802077
292
+ e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.17585670830781294
293
+ e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.11077316594795349
294
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1648812739511937
295
+ e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.1945014895681582
296
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17329021394802077
297
+ e2e_nlg_cleaned,5,average,multiple,0.14417676054336107
298
+ gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0190566029197429
299
+ gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.05349637631115593
300
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.050742967235947956
301
+ gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.040126835769534804
302
+ gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.05124089074244038
303
+ gem_xsum,0,median,rouge2_fmeasure,0.050742967235947956
304
+ gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.01908469081931983
305
+ gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.051542487477497304
306
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03956915695649403
307
+ gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04851112854401421
308
+ gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.03871722499957788
309
+ gem_xsum,1,median,rouge2_fmeasure,0.03956915695649403
310
+ gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.026322118720045605
311
+ gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05337831779753894
312
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.040730679478674064
313
+ gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.04736923476037229
314
+ gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.038463882894735665
315
+ gem_xsum,2,median,rouge2_fmeasure,0.040730679478674064
316
+ gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03227615271942288
317
+ gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.052933674983345634
318
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03767895922224648
319
+ gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.04353291741965738
320
+ gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.0361588906854937
321
+ gem_xsum,3,median,rouge2_fmeasure,0.03767895922224648
322
+ gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.00873416530365632
323
+ gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.01418426797251855
324
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01035393012550112
325
+ gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.011212198666180598
326
+ gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.009582916301059853
327
+ gem_xsum,4,median,rouge2_fmeasure,0.01035393012550112
328
+ gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
329
+ gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.000325473526945072
330
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003555930988203656
331
+ gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.0005008107704990395
332
+ gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.0
333
+ gem_xsum,5,median,rouge2_fmeasure,0.000325473526945072
334
+ gem_xsum,5,average,multiple,0.029900194424301453
335
+ piqa,0,Correct the solution,rouge2_fmeasure,0.15599026594193496
336
+ piqa,0,choose the most appropriate solution,acc,0.49455930359085964
337
+ piqa,0,no prompt needed,rouge2_fmeasure,0.005465981531797976
338
+ piqa,0,pick_correct_choice_index,acc,0.49510337323177367
339
+ piqa,0,what_is_the_correct_ending,acc,0.559847660500544
340
+ piqa,0,median,accuracy,0.49510337323177367
341
+ piqa,1,Correct the solution,rouge2_fmeasure,0.20124233569826713
342
+ piqa,1,choose the most appropriate solution,acc,0.5021762785636561
343
+ piqa,1,no prompt needed,rouge2_fmeasure,0.005423476681292554
344
+ piqa,1,pick_correct_choice_index,acc,0.49782372143634385
345
+ piqa,1,what_is_the_correct_ending,acc,0.5418933623503809
346
+ piqa,1,median,accuracy,0.5021762785636561
347
+ piqa,2,Correct the solution,rouge2_fmeasure,0.3238830207743833
348
+ piqa,2,choose the most appropriate solution,acc,0.514689880304679
349
+ piqa,2,no prompt needed,rouge2_fmeasure,0.004712505847751591
350
+ piqa,2,pick_correct_choice_index,acc,0.49347116430903154
351
+ piqa,2,what_is_the_correct_ending,acc,0.5321001088139282
352
+ piqa,2,median,accuracy,0.514689880304679
353
+ piqa,3,Correct the solution,rouge2_fmeasure,0.4057090348109076
354
+ piqa,3,choose the most appropriate solution,acc,0.5130576713819369
355
+ piqa,3,no prompt needed,rouge2_fmeasure,0.00470586424330017
356
+ piqa,3,pick_correct_choice_index,acc,0.4880304678998912
357
+ piqa,3,what_is_the_correct_ending,acc,0.5310119695321001
358
+ piqa,3,median,accuracy,0.5130576713819369
359
+ piqa,4,Correct the solution,rouge2_fmeasure,0.44072198270594876
360
+ piqa,4,choose the most appropriate solution,acc,0.5076169749727966
361
+ piqa,4,no prompt needed,rouge2_fmeasure,0.004310884060921524
362
+ piqa,4,pick_correct_choice_index,acc,0.5195865070729053
363
+ piqa,4,what_is_the_correct_ending,acc,0.5413492927094669
364
+ piqa,4,median,accuracy,0.5195865070729053
365
+ piqa,5,Correct the solution,rouge2_fmeasure,0.4560672630321141
366
+ piqa,5,choose the most appropriate solution,acc,0.5087051142546246
367
+ piqa,5,no prompt needed,rouge2_fmeasure,0.0046971364054093695
368
+ piqa,5,pick_correct_choice_index,acc,0.5076169749727966
369
+ piqa,5,what_is_the_correct_ending,acc,0.5386289445048966
370
+ piqa,5,median,accuracy,0.5087051142546246
371
+ piqa,5,average,multiple,0.5088864708015959
372
+ sciq,0,Direct Question,acc,0.876
373
+ sciq,0,Direct Question (Closed Book),acc,0.623
374
+ sciq,0,Multiple Choice,acc,0.6
375
+ sciq,0,Multiple Choice (Closed Book),acc,0.486
376
+ sciq,0,Multiple Choice Question First,acc,0.627
377
+ sciq,0,median,accuracy,0.623
378
+ sciq,1,Direct Question,acc,0.913
379
+ sciq,1,Direct Question (Closed Book),acc,0.698
380
+ sciq,1,Multiple Choice,acc,0.585
381
+ sciq,1,Multiple Choice (Closed Book),acc,0.517
382
+ sciq,1,Multiple Choice Question First,acc,0.51
383
+ sciq,1,median,accuracy,0.585
384
+ sciq,2,Direct Question,acc,0.914
385
+ sciq,2,Direct Question (Closed Book),acc,0.715
386
+ sciq,2,Multiple Choice,acc,0.608
387
+ sciq,2,Multiple Choice (Closed Book),acc,0.51
388
+ sciq,2,Multiple Choice Question First,acc,0.583
389
+ sciq,2,median,accuracy,0.608
390
+ sciq,3,Direct Question,acc,0.92
391
+ sciq,3,Direct Question (Closed Book),acc,0.71
392
+ sciq,3,Multiple Choice,acc,0.637
393
+ sciq,3,Multiple Choice (Closed Book),acc,0.529
394
+ sciq,3,Multiple Choice Question First,acc,0.595
395
+ sciq,3,median,accuracy,0.637
396
+ sciq,4,Direct Question,acc,0.922
397
+ sciq,4,Direct Question (Closed Book),acc,0.717
398
+ sciq,4,Multiple Choice,acc,0.62
399
+ sciq,4,Multiple Choice (Closed Book),acc,0.545
400
+ sciq,4,Multiple Choice Question First,acc,0.599
401
+ sciq,4,median,accuracy,0.62
402
+ sciq,5,Direct Question,acc,0.924
403
+ sciq,5,Direct Question (Closed Book),acc,0.727
404
+ sciq,5,Multiple Choice,acc,0.625
405
+ sciq,5,Multiple Choice (Closed Book),acc,0.547
406
+ sciq,5,Multiple Choice Question First,acc,0.585
407
+ sciq,5,median,accuracy,0.625
408
+ sciq,5,average,multiple,0.6163333333333333
409
+ story_cloze_2016,0,Answer Given options,acc,0.4778193479422769
410
+ story_cloze_2016,0,Choose Story Ending,acc,0.4890432923570283
411
+ story_cloze_2016,0,Novel Correct Ending,acc,0.4751469802244789
412
+ story_cloze_2016,0,Story Continuation and Options,acc,0.5114911811865313
413
+ story_cloze_2016,0,median,accuracy,0.4834313201496526
414
+ story_cloze_2016,1,Answer Given options,acc,0.4585783003741315
415
+ story_cloze_2016,1,Choose Story Ending,acc,0.46980224478888294
416
+ story_cloze_2016,1,Novel Correct Ending,acc,0.47995724211651525
417
+ story_cloze_2016,1,Story Continuation and Options,acc,0.48957776590058794
418
+ story_cloze_2016,1,median,accuracy,0.4748797434526991
419
+ story_cloze_2016,2,Answer Given options,acc,0.46980224478888294
420
+ story_cloze_2016,2,Choose Story Ending,acc,0.4660609299839658
421
+ story_cloze_2016,2,Novel Correct Ending,acc,0.4730090860502405
422
+ story_cloze_2016,2,Story Continuation and Options,acc,0.4949225013361839
423
+ story_cloze_2016,2,median,accuracy,0.47140566541956175
424
+ story_cloze_2016,3,Answer Given options,acc,0.46125066809192944
425
+ story_cloze_2016,3,Choose Story Ending,acc,0.47247461250668094
426
+ story_cloze_2016,3,Novel Correct Ending,acc,0.4655264564404062
427
+ story_cloze_2016,3,Story Continuation and Options,acc,0.4938535542490647
428
+ story_cloze_2016,3,median,accuracy,0.46900053447354356
429
+ story_cloze_2016,4,Answer Given options,acc,0.4436130411544629
430
+ story_cloze_2016,4,Choose Story Ending,acc,0.46392303580972744
431
+ story_cloze_2016,4,Novel Correct Ending,acc,0.46125066809192944
432
+ story_cloze_2016,4,Story Continuation and Options,acc,0.5077498663816141
433
+ story_cloze_2016,4,median,accuracy,0.46258685195082844
434
+ story_cloze_2016,5,Answer Given options,acc,0.4462854088722608
435
+ story_cloze_2016,5,Choose Story Ending,acc,0.4708711918760021
436
+ story_cloze_2016,5,Novel Correct Ending,acc,0.4462854088722608
437
+ story_cloze_2016,5,Story Continuation and Options,acc,0.4938535542490647
438
+ story_cloze_2016,5,median,accuracy,0.4585783003741315
439
+ story_cloze_2016,5,average,multiple,0.4699804026367362
440
+ superglue_rte,0,GPT-3 style,acc,0.5234657039711191
441
+ superglue_rte,0,MNLI crowdsource,acc,0.48014440433212996
442
+ superglue_rte,0,does it follow that,acc,0.48014440433212996
443
+ superglue_rte,0,guaranteed true,acc,0.49458483754512633
444
+ superglue_rte,0,should assume,acc,0.4981949458483754
445
+ superglue_rte,0,median,accuracy,0.49458483754512633
446
+ superglue_rte,1,GPT-3 style,acc,0.516245487364621
447
+ superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
448
+ superglue_rte,1,does it follow that,acc,0.49097472924187724
449
+ superglue_rte,1,guaranteed true,acc,0.49458483754512633
450
+ superglue_rte,1,should assume,acc,0.49097472924187724
451
+ superglue_rte,1,median,accuracy,0.49097472924187724
452
+ superglue_rte,2,GPT-3 style,acc,0.5270758122743683
453
+ superglue_rte,2,MNLI crowdsource,acc,0.5054151624548736
454
+ superglue_rte,2,does it follow that,acc,0.5126353790613718
455
+ superglue_rte,2,guaranteed true,acc,0.5090252707581228
456
+ superglue_rte,2,should assume,acc,0.5054151624548736
457
+ superglue_rte,2,median,accuracy,0.5090252707581228
458
+ superglue_rte,3,GPT-3 style,acc,0.5342960288808665
459
+ superglue_rte,3,MNLI crowdsource,acc,0.516245487364621
460
+ superglue_rte,3,does it follow that,acc,0.51985559566787
461
+ superglue_rte,3,guaranteed true,acc,0.5054151624548736
462
+ superglue_rte,3,should assume,acc,0.5306859205776173
463
+ superglue_rte,3,median,accuracy,0.51985559566787
464
+ superglue_rte,4,GPT-3 style,acc,0.555956678700361
465
+ superglue_rte,4,MNLI crowdsource,acc,0.5342960288808665
466
+ superglue_rte,4,does it follow that,acc,0.51985559566787
467
+ superglue_rte,4,guaranteed true,acc,0.5379061371841155
468
+ superglue_rte,4,should assume,acc,0.5523465703971119
469
+ superglue_rte,4,median,accuracy,0.5379061371841155
470
+ superglue_rte,5,GPT-3 style,acc,0.5667870036101083
471
+ superglue_rte,5,MNLI crowdsource,acc,0.5415162454873647
472
+ superglue_rte,5,does it follow that,acc,0.5379061371841155
473
+ superglue_rte,5,guaranteed true,acc,0.5270758122743683
474
+ superglue_rte,5,should assume,acc,0.51985559566787
475
+ superglue_rte,5,median,accuracy,0.5379061371841155
476
+ superglue_rte,5,average,multiple,0.5150421179302046
477
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05308201459552208
478
+ web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.0031079038623638374
479
+ web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.004093823034187833
480
+ web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.0091292497595853
481
+ web_nlg_en,0,very-explicit-description,rouge2_fmeasure,0.004145988607839967
482
+ web_nlg_en,0,median,rouge2_fmeasure,0.004145988607839967
483
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.054620931903283015
484
+ web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.10582834218904164
485
+ web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.048933388223398
486
+ web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.09345632716516253
487
+ web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.03326346351515616
488
+ web_nlg_en,1,median,rouge2_fmeasure,0.054620931903283015
489
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.04972618028817665
490
+ web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.18715015079380715
491
+ web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.0664535256463577
492
+ web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.09362913479276391
493
+ web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.05216469490783778
494
+ web_nlg_en,2,median,rouge2_fmeasure,0.0664535256463577
495
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.038282775688304856
496
+ web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.18921046750479484
497
+ web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.07150806038871549
498
+ web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.08163089196343765
499
+ web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.06259164725081237
500
+ web_nlg_en,3,median,rouge2_fmeasure,0.07150806038871549
501
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.035274756528572794
502
+ web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.1826637134726461
503
+ web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.0756101747685517
504
+ web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.0843786630967621
505
+ web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.07984816777629551
506
+ web_nlg_en,4,median,rouge2_fmeasure,0.07984816777629551
507
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.04337652772461485
508
+ web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.1794175550575056
509
+ web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.07734137021121697
510
+ web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.10182499744322755
511
+ web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.08392662509619025
512
+ web_nlg_en,5,median,rouge2_fmeasure,0.08392662509619025
513
+ web_nlg_en,5,average,multiple,0.06008388323644699
514
+ wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.046175339585206684
515
+ wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.011691902088528949
516
+ wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.013469176886654546
517
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.033925070200158246
518
+ wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.0012597496537706083
519
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.013469176886654546
520
+ wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.040168134051973815
521
+ wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.022783938421878452
522
+ wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.029442252351593513
523
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.053937307211284244
524
+ wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.02069013916427929
525
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.029442252351593513
526
+ wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.04567156337896173
527
+ wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.040384981669924526
528
+ wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.042503871652830684
529
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05625290668830642
530
+ wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.020465708060419606
531
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.042503871652830684
532
+ wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.03899411935423918
533
+ wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03839113878587267
534
+ wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.03902103124926395
535
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04688317854067561
536
+ wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.01711652805678942
537
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.03899411935423918
538
+ wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.012617379158558721
539
+ wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.012281079448761325
540
+ wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.01084532274916111
541
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013810868807903593
542
+ wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.0045440484711843825
543
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.012281079448761325
544
+ wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.0019124752375349172
545
+ wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0017130160893721097
546
+ wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0013869187743711499
547
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024939038248004536
548
+ wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.00036639163657650276
549
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0017130160893721097
550
+ wiki_lingua_en,5,average,multiple,0.023067252630575227
551
+ winogrande,0,Replace,acc,0.5019731649565904
552
+ winogrande,0,True or False,acc,0.4956590370955012
553
+ winogrande,0,does underscore refer to,acc,0.4996053670086819
554
+ winogrande,0,stand for,acc,0.510655090765588
555
+ winogrande,0,underscore refer to,acc,0.5138121546961326
556
+ winogrande,0,median,accuracy,0.5019731649565904
557
+ winogrande,1,Replace,acc,0.5074980268350434
558
+ winogrande,1,True or False,acc,0.48855564325177586
559
+ winogrande,1,does underscore refer to,acc,0.4956590370955012
560
+ winogrande,1,stand for,acc,0.5035516969218626
561
+ winogrande,1,underscore refer to,acc,0.4972375690607735
562
+ winogrande,1,median,accuracy,0.4972375690607735
563
+ winogrande,2,Replace,acc,0.5090765588003157
564
+ winogrande,2,True or False,acc,0.4940805051302289
565
+ winogrande,2,does underscore refer to,acc,0.5011838989739542
566
+ winogrande,2,stand for,acc,0.4980268350434096
567
+ winogrande,2,underscore refer to,acc,0.5082872928176796
568
+ winogrande,2,median,accuracy,0.5011838989739542
569
+ winogrande,3,Replace,acc,0.5217048145224941
570
+ winogrande,3,True or False,acc,0.4996053670086819
571
+ winogrande,3,does underscore refer to,acc,0.5153906866614049
572
+ winogrande,3,stand for,acc,0.5035516969218626
573
+ winogrande,3,underscore refer to,acc,0.505130228887135
574
+ winogrande,3,median,accuracy,0.505130228887135
575
+ winogrande,4,Replace,acc,0.5224940805051302
576
+ winogrande,4,True or False,acc,0.5027624309392266
577
+ winogrande,4,does underscore refer to,acc,0.5098658247829518
578
+ winogrande,4,stand for,acc,0.5082872928176796
579
+ winogrande,4,underscore refer to,acc,0.5043409629044988
580
+ winogrande,4,median,accuracy,0.5082872928176796
581
+ winogrande,5,Replace,acc,0.5122336227308603
582
+ winogrande,5,True or False,acc,0.5035516969218626
583
+ winogrande,5,does underscore refer to,acc,0.5074980268350434
584
+ winogrande,5,stand for,acc,0.48382004735595896
585
+ winogrande,5,underscore refer to,acc,0.5098658247829518
586
+ winogrande,5,median,accuracy,0.5074980268350434
587
+ winogrande,5,average,multiple,0.5035516969218626
4b284b17bc4/eval/merged.json ADDED
The diff for this file is too large to render. See raw diff
 
4b284b21bc4/eval/merged.csv ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ anli_r1,0,GPT-3 style,acc,0.331
3
+ anli_r1,0,MNLI crowdsource,acc,0.333
4
+ anli_r1,0,can we infer,acc,0.358
5
+ anli_r1,0,guaranteed/possible/impossible,acc,0.327
6
+ anli_r1,0,justified in saying,acc,0.356
7
+ anli_r1,0,median,accuracy,0.333
8
+ anli_r1,1,GPT-3 style,acc,0.327
9
+ anli_r1,1,MNLI crowdsource,acc,0.333
10
+ anli_r1,1,can we infer,acc,0.333
11
+ anli_r1,1,guaranteed/possible/impossible,acc,0.332
12
+ anli_r1,1,justified in saying,acc,0.333
13
+ anli_r1,1,median,accuracy,0.333
14
+ anli_r1,2,GPT-3 style,acc,0.335
15
+ anli_r1,2,MNLI crowdsource,acc,0.358
16
+ anli_r1,2,can we infer,acc,0.361
17
+ anli_r1,2,guaranteed/possible/impossible,acc,0.329
18
+ anli_r1,2,justified in saying,acc,0.358
19
+ anli_r1,2,median,accuracy,0.358
20
+ anli_r1,3,GPT-3 style,acc,0.347
21
+ anli_r1,3,MNLI crowdsource,acc,0.358
22
+ anli_r1,3,can we infer,acc,0.35
23
+ anli_r1,3,guaranteed/possible/impossible,acc,0.328
24
+ anli_r1,3,justified in saying,acc,0.355
25
+ anli_r1,3,median,accuracy,0.35
26
+ anli_r1,4,GPT-3 style,acc,0.329
27
+ anli_r1,4,MNLI crowdsource,acc,0.354
28
+ anli_r1,4,can we infer,acc,0.344
29
+ anli_r1,4,guaranteed/possible/impossible,acc,0.328
30
+ anli_r1,4,justified in saying,acc,0.336
31
+ anli_r1,4,median,accuracy,0.336
32
+ anli_r1,5,GPT-3 style,acc,0.339
33
+ anli_r1,5,MNLI crowdsource,acc,0.345
34
+ anli_r1,5,can we infer,acc,0.329
35
+ anli_r1,5,guaranteed/possible/impossible,acc,0.331
36
+ anli_r1,5,justified in saying,acc,0.337
37
+ anli_r1,5,median,accuracy,0.337
38
+ anli_r1,5,average,multiple,0.3411666666666667
39
+ anli_r2,0,GPT-3 style,acc,0.334
40
+ anli_r2,0,MNLI crowdsource,acc,0.333
41
+ anli_r2,0,can we infer,acc,0.35
42
+ anli_r2,0,guaranteed/possible/impossible,acc,0.34
43
+ anli_r2,0,justified in saying,acc,0.339
44
+ anli_r2,0,median,accuracy,0.339
45
+ anli_r2,1,GPT-3 style,acc,0.313
46
+ anli_r2,1,MNLI crowdsource,acc,0.315
47
+ anli_r2,1,can we infer,acc,0.315
48
+ anli_r2,1,guaranteed/possible/impossible,acc,0.314
49
+ anli_r2,1,justified in saying,acc,0.315
50
+ anli_r2,1,median,accuracy,0.315
51
+ anli_r2,2,GPT-3 style,acc,0.333
52
+ anli_r2,2,MNLI crowdsource,acc,0.329
53
+ anli_r2,2,can we infer,acc,0.323
54
+ anli_r2,2,guaranteed/possible/impossible,acc,0.31
55
+ anli_r2,2,justified in saying,acc,0.323
56
+ anli_r2,2,median,accuracy,0.323
57
+ anli_r2,3,GPT-3 style,acc,0.337
58
+ anli_r2,3,MNLI crowdsource,acc,0.317
59
+ anli_r2,3,can we infer,acc,0.338
60
+ anli_r2,3,guaranteed/possible/impossible,acc,0.32
61
+ anli_r2,3,justified in saying,acc,0.329
62
+ anli_r2,3,median,accuracy,0.329
63
+ anli_r2,4,GPT-3 style,acc,0.336
64
+ anli_r2,4,MNLI crowdsource,acc,0.314
65
+ anli_r2,4,can we infer,acc,0.334
66
+ anli_r2,4,guaranteed/possible/impossible,acc,0.326
67
+ anli_r2,4,justified in saying,acc,0.329
68
+ anli_r2,4,median,accuracy,0.329
69
+ anli_r2,5,GPT-3 style,acc,0.342
70
+ anli_r2,5,MNLI crowdsource,acc,0.304
71
+ anli_r2,5,can we infer,acc,0.324
72
+ anli_r2,5,guaranteed/possible/impossible,acc,0.332
73
+ anli_r2,5,justified in saying,acc,0.317
74
+ anli_r2,5,median,accuracy,0.324
75
+ anli_r2,5,average,multiple,0.3265
76
+ anli_r3,0,GPT-3 style,acc,0.335
77
+ anli_r3,0,MNLI crowdsource,acc,0.3358333333333333
78
+ anli_r3,0,can we infer,acc,0.3333333333333333
79
+ anli_r3,0,guaranteed/possible/impossible,acc,0.32083333333333336
80
+ anli_r3,0,justified in saying,acc,0.3416666666666667
81
+ anli_r3,0,median,accuracy,0.335
82
+ anli_r3,1,GPT-3 style,acc,0.335
83
+ anli_r3,1,MNLI crowdsource,acc,0.33666666666666667
84
+ anli_r3,1,can we infer,acc,0.33666666666666667
85
+ anli_r3,1,guaranteed/possible/impossible,acc,0.3358333333333333
86
+ anli_r3,1,justified in saying,acc,0.33666666666666667
87
+ anli_r3,1,median,accuracy,0.33666666666666667
88
+ anli_r3,2,GPT-3 style,acc,0.33
89
+ anli_r3,2,MNLI crowdsource,acc,0.3233333333333333
90
+ anli_r3,2,can we infer,acc,0.325
91
+ anli_r3,2,guaranteed/possible/impossible,acc,0.32
92
+ anli_r3,2,justified in saying,acc,0.325
93
+ anli_r3,2,median,accuracy,0.325
94
+ anli_r3,3,GPT-3 style,acc,0.33166666666666667
95
+ anli_r3,3,MNLI crowdsource,acc,0.31833333333333336
96
+ anli_r3,3,can we infer,acc,0.3325
97
+ anli_r3,3,guaranteed/possible/impossible,acc,0.3308333333333333
98
+ anli_r3,3,justified in saying,acc,0.3408333333333333
99
+ anli_r3,3,median,accuracy,0.33166666666666667
100
+ anli_r3,4,GPT-3 style,acc,0.32666666666666666
101
+ anli_r3,4,MNLI crowdsource,acc,0.31583333333333335
102
+ anli_r3,4,can we infer,acc,0.31583333333333335
103
+ anli_r3,4,guaranteed/possible/impossible,acc,0.33666666666666667
104
+ anli_r3,4,justified in saying,acc,0.3175
105
+ anli_r3,4,median,accuracy,0.3175
106
+ anli_r3,5,GPT-3 style,acc,0.31916666666666665
107
+ anli_r3,5,MNLI crowdsource,acc,0.31
108
+ anli_r3,5,can we infer,acc,0.31166666666666665
109
+ anli_r3,5,guaranteed/possible/impossible,acc,0.3383333333333333
110
+ anli_r3,5,justified in saying,acc,0.30833333333333335
111
+ anli_r3,5,median,accuracy,0.31166666666666665
112
+ anli_r3,5,average,multiple,0.32625
113
+ arc_easy,0,heres_a_problem,acc,0.25
114
+ arc_easy,0,i_am_hesitating,acc,0.35395622895622897
115
+ arc_easy,0,multiple_choice,acc,0.23378839590443687
116
+ arc_easy,0,pick_the_most_correct_option,acc,0.24705387205387205
117
+ arc_easy,0,qa_options,acc,0.26023890784982934
118
+ arc_easy,0,median,accuracy,0.25
119
+ arc_easy,1,heres_a_problem,acc,0.24368686868686867
120
+ arc_easy,1,i_am_hesitating,acc,0.3468013468013468
121
+ arc_easy,1,multiple_choice,acc,0.3253367003367003
122
+ arc_easy,1,pick_the_most_correct_option,acc,0.2295221843003413
123
+ arc_easy,1,qa_options,acc,0.3425925925925926
124
+ arc_easy,1,median,accuracy,0.3253367003367003
125
+ arc_easy,2,heres_a_problem,acc,0.2508532423208191
126
+ arc_easy,2,i_am_hesitating,acc,0.3383838383838384
127
+ arc_easy,2,multiple_choice,acc,0.351010101010101
128
+ arc_easy,2,pick_the_most_correct_option,acc,0.24829351535836178
129
+ arc_easy,2,qa_options,acc,0.335016835016835
130
+ arc_easy,2,median,accuracy,0.335016835016835
131
+ arc_easy,3,heres_a_problem,acc,0.24915824915824916
132
+ arc_easy,3,i_am_hesitating,acc,0.25170648464163825
133
+ arc_easy,3,multiple_choice,acc,0.2380546075085324
134
+ arc_easy,3,pick_the_most_correct_option,acc,0.25170648464163825
135
+ arc_easy,3,qa_options,acc,0.3400673400673401
136
+ arc_easy,3,median,accuracy,0.25170648464163825
137
+ arc_easy,4,heres_a_problem,acc,0.24284511784511784
138
+ arc_easy,4,i_am_hesitating,acc,0.3480639730639731
139
+ arc_easy,4,multiple_choice,acc,0.24146757679180889
140
+ arc_easy,4,pick_the_most_correct_option,acc,0.24284511784511784
141
+ arc_easy,4,qa_options,acc,0.3367003367003367
142
+ arc_easy,4,median,accuracy,0.24284511784511784
143
+ arc_easy,5,heres_a_problem,acc,0.2431740614334471
144
+ arc_easy,5,i_am_hesitating,acc,0.33880471380471383
145
+ arc_easy,5,multiple_choice,acc,0.33796296296296297
146
+ arc_easy,5,pick_the_most_correct_option,acc,0.25
147
+ arc_easy,5,qa_options,acc,0.25170648464163825
148
+ arc_easy,5,median,accuracy,0.25170648464163825
149
+ arc_easy,5,average,multiple,0.2761019370803216
150
+ boolq,0,GPT-3 Style,acc,0.5143333333333333
151
+ boolq,0,after_reading,acc,0.6233333333333333
152
+ boolq,0,exercise,acc,0.6236666666666667
153
+ boolq,0,valid_binary,acc,0.5753333333333334
154
+ boolq,0,yes_no_question,acc,0.5276666666666666
155
+ boolq,0,median,accuracy,0.5753333333333334
156
+ boolq,1,GPT-3 Style,acc,0.493
157
+ boolq,1,after_reading,acc,0.546
158
+ boolq,1,exercise,acc,0.6096666666666667
159
+ boolq,1,valid_binary,acc,0.5676666666666667
160
+ boolq,1,yes_no_question,acc,0.5406666666666666
161
+ boolq,1,median,accuracy,0.546
162
+ boolq,2,GPT-3 Style,acc,0.5063333333333333
163
+ boolq,2,after_reading,acc,0.5836666666666667
164
+ boolq,2,exercise,acc,0.6033333333333334
165
+ boolq,2,valid_binary,acc,0.593
166
+ boolq,2,yes_no_question,acc,0.5303333333333333
167
+ boolq,2,median,accuracy,0.5836666666666667
168
+ boolq,3,GPT-3 Style,acc,0.528
169
+ boolq,3,after_reading,acc,0.6116666666666667
170
+ boolq,3,exercise,acc,0.6083333333333333
171
+ boolq,3,valid_binary,acc,0.6066666666666667
172
+ boolq,3,yes_no_question,acc,0.5283333333333333
173
+ boolq,3,median,accuracy,0.6066666666666667
174
+ boolq,4,GPT-3 Style,acc,0.531
175
+ boolq,4,after_reading,acc,0.6136666666666667
176
+ boolq,4,exercise,acc,0.6133333333333333
177
+ boolq,4,valid_binary,acc,0.614
178
+ boolq,4,yes_no_question,acc,0.5186666666666667
179
+ boolq,4,median,accuracy,0.6133333333333333
180
+ boolq,5,GPT-3 Style,acc,0.5486666666666666
181
+ boolq,5,after_reading,acc,0.6126666666666667
182
+ boolq,5,exercise,acc,0.6183333333333333
183
+ boolq,5,valid_binary,acc,0.6123333333333333
184
+ boolq,5,yes_no_question,acc,0.5196666666666667
185
+ boolq,5,median,accuracy,0.6123333333333333
186
+ boolq,5,average,multiple,0.5895555555555556
187
+ cb,0,GPT-3 style,acc,0.375
188
+ cb,0,MNLI crowdsource,acc,0.4107142857142857
189
+ cb,0,can we infer,acc,0.5357142857142857
190
+ cb,0,guaranteed/possible/impossible,acc,0.10714285714285714
191
+ cb,0,justified in saying,acc,0.5178571428571429
192
+ cb,0,median,accuracy,0.4107142857142857
193
+ cb,1,GPT-3 style,acc,0.375
194
+ cb,1,MNLI crowdsource,acc,0.39285714285714285
195
+ cb,1,can we infer,acc,0.39285714285714285
196
+ cb,1,guaranteed/possible/impossible,acc,0.375
197
+ cb,1,justified in saying,acc,0.39285714285714285
198
+ cb,1,median,accuracy,0.39285714285714285
199
+ cb,2,GPT-3 style,acc,0.35714285714285715
200
+ cb,2,MNLI crowdsource,acc,0.4642857142857143
201
+ cb,2,can we infer,acc,0.39285714285714285
202
+ cb,2,guaranteed/possible/impossible,acc,0.25
203
+ cb,2,justified in saying,acc,0.39285714285714285
204
+ cb,2,median,accuracy,0.39285714285714285
205
+ cb,3,GPT-3 style,acc,0.3392857142857143
206
+ cb,3,MNLI crowdsource,acc,0.4107142857142857
207
+ cb,3,can we infer,acc,0.39285714285714285
208
+ cb,3,guaranteed/possible/impossible,acc,0.14285714285714285
209
+ cb,3,justified in saying,acc,0.375
210
+ cb,3,median,accuracy,0.375
211
+ cb,4,GPT-3 style,acc,0.32142857142857145
212
+ cb,4,MNLI crowdsource,acc,0.42857142857142855
213
+ cb,4,can we infer,acc,0.44642857142857145
214
+ cb,4,guaranteed/possible/impossible,acc,0.10714285714285714
215
+ cb,4,justified in saying,acc,0.44642857142857145
216
+ cb,4,median,accuracy,0.42857142857142855
217
+ cb,5,GPT-3 style,acc,0.2857142857142857
218
+ cb,5,MNLI crowdsource,acc,0.4107142857142857
219
+ cb,5,can we infer,acc,0.44642857142857145
220
+ cb,5,guaranteed/possible/impossible,acc,0.14285714285714285
221
+ cb,5,justified in saying,acc,0.44642857142857145
222
+ cb,5,median,accuracy,0.4107142857142857
223
+ cb,5,average,multiple,0.4017857142857143
224
+ copa,0,best_option,acc,0.6
225
+ copa,0,cause_effect,acc,0.54
226
+ copa,0,choose,acc,0.58
227
+ copa,0,i_am_hesitating,acc,0.54
228
+ copa,0,plausible_alternatives,acc,0.54
229
+ copa,0,median,accuracy,0.54
230
+ copa,1,best_option,acc,0.53
231
+ copa,1,cause_effect,acc,0.42
232
+ copa,1,choose,acc,0.44
233
+ copa,1,i_am_hesitating,acc,0.43
234
+ copa,1,plausible_alternatives,acc,0.45
235
+ copa,1,median,accuracy,0.44
236
+ copa,2,best_option,acc,0.63
237
+ copa,2,cause_effect,acc,0.44
238
+ copa,2,choose,acc,0.4
239
+ copa,2,i_am_hesitating,acc,0.41
240
+ copa,2,plausible_alternatives,acc,0.42
241
+ copa,2,median,accuracy,0.42
242
+ copa,3,best_option,acc,0.6
243
+ copa,3,cause_effect,acc,0.44
244
+ copa,3,choose,acc,0.39
245
+ copa,3,i_am_hesitating,acc,0.44
246
+ copa,3,plausible_alternatives,acc,0.44
247
+ copa,3,median,accuracy,0.44
248
+ copa,4,best_option,acc,0.62
249
+ copa,4,cause_effect,acc,0.45
250
+ copa,4,choose,acc,0.41
251
+ copa,4,i_am_hesitating,acc,0.43
252
+ copa,4,plausible_alternatives,acc,0.44
253
+ copa,4,median,accuracy,0.44
254
+ copa,5,best_option,acc,0.58
255
+ copa,5,cause_effect,acc,0.47
256
+ copa,5,choose,acc,0.44
257
+ copa,5,i_am_hesitating,acc,0.48
258
+ copa,5,plausible_alternatives,acc,0.46
259
+ copa,5,median,accuracy,0.47
260
+ copa,5,average,multiple,0.4583333333333333
261
+ e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.06946399430025461
262
+ e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.022387020564367744
263
+ e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.0
264
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.06289750165250287
265
+ e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.05547805696954945
266
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.05547805696954945
267
+ e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1605728054943973
268
+ e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.15944824307809596
269
+ e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.028616212681722628
270
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.12449829406834531
271
+ e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.19999728868621525
272
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.15944824307809596
273
+ e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.177921888015793
274
+ e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.17045094780052067
275
+ e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.07112804192230661
276
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.14591205568832014
277
+ e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.19727207437417654
278
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.17045094780052067
279
+ e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.1797184980766115
280
+ e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.1725725568885113
281
+ e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.10309288089148716
282
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14908085018598377
283
+ e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.1964238350803286
284
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.1725725568885113
285
+ e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.17842808573274627
286
+ e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.1732811817482548
287
+ e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.1257055751954671
288
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.15250785714191883
289
+ e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19301601907405783
290
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1732811817482548
291
+ e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.17801029501851723
292
+ e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.17347865411768018
293
+ e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.14176701175164574
294
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.15567663749325128
295
+ e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.1925812091012645
296
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17347865411768018
297
+ e2e_nlg_cleaned,5,average,multiple,0.1507849401004354
298
+ gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.021404604329843858
299
+ gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.04267694660641669
300
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04639360161894793
301
+ gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.037047223818116884
302
+ gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.048478969239621084
303
+ gem_xsum,0,median,rouge2_fmeasure,0.04267694660641669
304
+ gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.019363039161219217
305
+ gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.046583919666909064
306
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.041352212313184845
307
+ gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04746052242779793
308
+ gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.03724797091171915
309
+ gem_xsum,1,median,rouge2_fmeasure,0.041352212313184845
310
+ gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.024834458548132245
311
+ gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05089418045053158
312
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04614704751240165
313
+ gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.05137980859666271
314
+ gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.04154990869291492
315
+ gem_xsum,2,median,rouge2_fmeasure,0.04614704751240165
316
+ gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0308541164205269
317
+ gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.05035265573315919
318
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04492583126972709
319
+ gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.05134777699396957
320
+ gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.040167379424322004
321
+ gem_xsum,3,median,rouge2_fmeasure,0.04492583126972709
322
+ gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.008953373288301222
323
+ gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.013792609834707413
324
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01203821663060757
325
+ gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.012186294567403028
326
+ gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.010750703825901103
327
+ gem_xsum,4,median,rouge2_fmeasure,0.01203821663060757
328
+ gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
329
+ gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.0004211068403029026
330
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003256051958251534
331
+ gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.00026998206537521717
332
+ gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.0
333
+ gem_xsum,5,median,rouge2_fmeasure,0.00026998206537521717
334
+ gem_xsum,5,average,multiple,0.031235039399618844
335
+ piqa,0,Correct the solution,rouge2_fmeasure,0.20168519353681674
336
+ piqa,0,choose the most appropriate solution,acc,0.49510337323177367
337
+ piqa,0,no prompt needed,rouge2_fmeasure,0.005612668710216912
338
+ piqa,0,pick_correct_choice_index,acc,0.49510337323177367
339
+ piqa,0,what_is_the_correct_ending,acc,0.5609357997823722
340
+ piqa,0,median,accuracy,0.49510337323177367
341
+ piqa,1,Correct the solution,rouge2_fmeasure,0.16670313208229318
342
+ piqa,1,choose the most appropriate solution,acc,0.49510337323177367
343
+ piqa,1,no prompt needed,rouge2_fmeasure,0.005467793325653137
344
+ piqa,1,pick_correct_choice_index,acc,0.4967355821545158
345
+ piqa,1,what_is_the_correct_ending,acc,0.5680087051142546
346
+ piqa,1,median,accuracy,0.4967355821545158
347
+ piqa,2,Correct the solution,rouge2_fmeasure,0.1635128839739126
348
+ piqa,2,choose the most appropriate solution,acc,0.5108813928182807
349
+ piqa,2,no prompt needed,rouge2_fmeasure,0.004589484275527073
350
+ piqa,2,pick_correct_choice_index,acc,0.4836779107725789
351
+ piqa,2,what_is_the_correct_ending,acc,0.5516866158868335
352
+ piqa,2,median,accuracy,0.5108813928182807
353
+ piqa,3,Correct the solution,rouge2_fmeasure,0.16228271943343794
354
+ piqa,3,choose the most appropriate solution,acc,0.5016322089227421
355
+ piqa,3,no prompt needed,rouge2_fmeasure,0.004446131485933507
356
+ piqa,3,pick_correct_choice_index,acc,0.4766050054406964
357
+ piqa,3,what_is_the_correct_ending,acc,0.5625680087051143
358
+ piqa,3,median,accuracy,0.5016322089227421
359
+ piqa,4,Correct the solution,rouge2_fmeasure,0.17392590467314775
360
+ piqa,4,choose the most appropriate solution,acc,0.5021762785636561
361
+ piqa,4,no prompt needed,rouge2_fmeasure,0.004018163648220329
362
+ piqa,4,pick_correct_choice_index,acc,0.4896626768226333
363
+ piqa,4,what_is_the_correct_ending,acc,0.5489662676822633
364
+ piqa,4,median,accuracy,0.5021762785636561
365
+ piqa,5,Correct the solution,rouge2_fmeasure,0.18898088840112187
366
+ piqa,5,choose the most appropriate solution,acc,0.499455930359086
367
+ piqa,5,no prompt needed,rouge2_fmeasure,0.0037514567534632703
368
+ piqa,5,pick_correct_choice_index,acc,0.4885745375408052
369
+ piqa,5,what_is_the_correct_ending,acc,0.5576713819368879
370
+ piqa,5,median,accuracy,0.499455930359086
371
+ piqa,5,average,multiple,0.5009974610083424
372
+ sciq,0,Direct Question,acc,0.862
373
+ sciq,0,Direct Question (Closed Book),acc,0.498
374
+ sciq,0,Multiple Choice,acc,0.569
375
+ sciq,0,Multiple Choice (Closed Book),acc,0.422
376
+ sciq,0,Multiple Choice Question First,acc,0.571
377
+ sciq,0,median,accuracy,0.569
378
+ sciq,1,Direct Question,acc,0.896
379
+ sciq,1,Direct Question (Closed Book),acc,0.65
380
+ sciq,1,Multiple Choice,acc,0.55
381
+ sciq,1,Multiple Choice (Closed Book),acc,0.43
382
+ sciq,1,Multiple Choice Question First,acc,0.427
383
+ sciq,1,median,accuracy,0.55
384
+ sciq,2,Direct Question,acc,0.917
385
+ sciq,2,Direct Question (Closed Book),acc,0.664
386
+ sciq,2,Multiple Choice,acc,0.565
387
+ sciq,2,Multiple Choice (Closed Book),acc,0.441
388
+ sciq,2,Multiple Choice Question First,acc,0.431
389
+ sciq,2,median,accuracy,0.565
390
+ sciq,3,Direct Question,acc,0.921
391
+ sciq,3,Direct Question (Closed Book),acc,0.681
392
+ sciq,3,Multiple Choice,acc,0.571
393
+ sciq,3,Multiple Choice (Closed Book),acc,0.481
394
+ sciq,3,Multiple Choice Question First,acc,0.441
395
+ sciq,3,median,accuracy,0.571
396
+ sciq,4,Direct Question,acc,0.918
397
+ sciq,4,Direct Question (Closed Book),acc,0.686
398
+ sciq,4,Multiple Choice,acc,0.588
399
+ sciq,4,Multiple Choice (Closed Book),acc,0.501
400
+ sciq,4,Multiple Choice Question First,acc,0.448
401
+ sciq,4,median,accuracy,0.588
402
+ sciq,5,Direct Question,acc,0.923
403
+ sciq,5,Direct Question (Closed Book),acc,0.708
404
+ sciq,5,Multiple Choice,acc,0.599
405
+ sciq,5,Multiple Choice (Closed Book),acc,0.524
406
+ sciq,5,Multiple Choice Question First,acc,0.451
407
+ sciq,5,median,accuracy,0.599
408
+ sciq,5,average,multiple,0.5736666666666667
409
+ story_cloze_2016,0,Answer Given options,acc,0.4719401389631213
410
+ story_cloze_2016,0,Choose Story Ending,acc,0.484233030464992
411
+ story_cloze_2016,0,Novel Correct Ending,acc,0.48583645109567075
412
+ story_cloze_2016,0,Story Continuation and Options,acc,0.4804917156600748
413
+ story_cloze_2016,0,median,accuracy,0.4823623730625334
414
+ story_cloze_2016,1,Answer Given options,acc,0.4730090860502405
415
+ story_cloze_2016,1,Choose Story Ending,acc,0.4794227685729556
416
+ story_cloze_2016,1,Novel Correct Ending,acc,0.47835382148583644
417
+ story_cloze_2016,1,Story Continuation and Options,acc,0.4681988241582042
418
+ story_cloze_2016,1,median,accuracy,0.4756814537680385
419
+ story_cloze_2016,2,Answer Given options,acc,0.46018172100481025
420
+ story_cloze_2016,2,Choose Story Ending,acc,0.4596472474612507
421
+ story_cloze_2016,2,Novel Correct Ending,acc,0.47140566541956175
422
+ story_cloze_2016,2,Story Continuation and Options,acc,0.4494922501336184
423
+ story_cloze_2016,2,median,accuracy,0.4599144842330305
424
+ story_cloze_2016,3,Answer Given options,acc,0.46178514163548906
425
+ story_cloze_2016,3,Choose Story Ending,acc,0.46873329770176375
426
+ story_cloze_2016,3,Novel Correct Ending,acc,0.4607161945483699
427
+ story_cloze_2016,3,Story Continuation and Options,acc,0.4580438268305719
428
+ story_cloze_2016,3,median,accuracy,0.4612506680919295
429
+ story_cloze_2016,4,Answer Given options,acc,0.4607161945483699
430
+ story_cloze_2016,4,Choose Story Ending,acc,0.46018172100481025
431
+ story_cloze_2016,4,Novel Correct Ending,acc,0.4537680384820951
432
+ story_cloze_2016,4,Story Continuation and Options,acc,0.4569748797434527
433
+ story_cloze_2016,4,median,accuracy,0.4585783003741315
434
+ story_cloze_2016,5,Answer Given options,acc,0.467129877071085
435
+ story_cloze_2016,5,Choose Story Ending,acc,0.4580438268305719
436
+ story_cloze_2016,5,Novel Correct Ending,acc,0.4548369855692143
437
+ story_cloze_2016,5,Story Continuation and Options,acc,0.45056119722073756
438
+ story_cloze_2016,5,median,accuracy,0.4564404061998931
439
+ story_cloze_2016,5,average,multiple,0.46570461428825943
440
+ superglue_rte,0,GPT-3 style,acc,0.516245487364621
441
+ superglue_rte,0,MNLI crowdsource,acc,0.48375451263537905
442
+ superglue_rte,0,does it follow that,acc,0.48375451263537905
443
+ superglue_rte,0,guaranteed true,acc,0.5379061371841155
444
+ superglue_rte,0,should assume,acc,0.5018050541516246
445
+ superglue_rte,0,median,accuracy,0.5018050541516246
446
+ superglue_rte,1,GPT-3 style,acc,0.51985559566787
447
+ superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
448
+ superglue_rte,1,does it follow that,acc,0.49097472924187724
449
+ superglue_rte,1,guaranteed true,acc,0.49097472924187724
450
+ superglue_rte,1,should assume,acc,0.49097472924187724
451
+ superglue_rte,1,median,accuracy,0.49097472924187724
452
+ superglue_rte,2,GPT-3 style,acc,0.51985559566787
453
+ superglue_rte,2,MNLI crowdsource,acc,0.5018050541516246
454
+ superglue_rte,2,does it follow that,acc,0.51985559566787
455
+ superglue_rte,2,guaranteed true,acc,0.5018050541516246
456
+ superglue_rte,2,should assume,acc,0.5090252707581228
457
+ superglue_rte,2,median,accuracy,0.5090252707581228
458
+ superglue_rte,3,GPT-3 style,acc,0.5234657039711191
459
+ superglue_rte,3,MNLI crowdsource,acc,0.49458483754512633
460
+ superglue_rte,3,does it follow that,acc,0.516245487364621
461
+ superglue_rte,3,guaranteed true,acc,0.516245487364621
462
+ superglue_rte,3,should assume,acc,0.5270758122743683
463
+ superglue_rte,3,median,accuracy,0.516245487364621
464
+ superglue_rte,4,GPT-3 style,acc,0.5234657039711191
465
+ superglue_rte,4,MNLI crowdsource,acc,0.4584837545126354
466
+ superglue_rte,4,does it follow that,acc,0.516245487364621
467
+ superglue_rte,4,guaranteed true,acc,0.49458483754512633
468
+ superglue_rte,4,should assume,acc,0.516245487364621
469
+ superglue_rte,4,median,accuracy,0.516245487364621
470
+ superglue_rte,5,GPT-3 style,acc,0.5270758122743683
471
+ superglue_rte,5,MNLI crowdsource,acc,0.44765342960288806
472
+ superglue_rte,5,does it follow that,acc,0.4981949458483754
473
+ superglue_rte,5,guaranteed true,acc,0.47653429602888087
474
+ superglue_rte,5,should assume,acc,0.51985559566787
475
+ superglue_rte,5,median,accuracy,0.4981949458483754
476
+ superglue_rte,5,average,multiple,0.5054151624548736
477
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04998894903569846
478
+ web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.008447632785522565
479
+ web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.004552592244101363
480
+ web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.012155444595988949
481
+ web_nlg_en,0,very-explicit-description,rouge2_fmeasure,1.477135016534563e-05
482
+ web_nlg_en,0,median,rouge2_fmeasure,0.008447632785522565
483
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051345397484036256
484
+ web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.09334851918279623
485
+ web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.057734516186082864
486
+ web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.12050338190039989
487
+ web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.08099215111185744
488
+ web_nlg_en,1,median,rouge2_fmeasure,0.08099215111185744
489
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.053828506115298144
490
+ web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.23334399501205214
491
+ web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.07451843059126674
492
+ web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.13465046885050352
493
+ web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.11449533899405696
494
+ web_nlg_en,2,median,rouge2_fmeasure,0.11449533899405696
495
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.051724489676439236
496
+ web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.2613220581162864
497
+ web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.08217622838318879
498
+ web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.14344396360996736
499
+ web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.11630716134029079
500
+ web_nlg_en,3,median,rouge2_fmeasure,0.11630716134029079
501
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.052942763106877684
502
+ web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.2637473868679139
503
+ web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.08649984259029919
504
+ web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.14528102357273256
505
+ web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.11317180592447942
506
+ web_nlg_en,4,median,rouge2_fmeasure,0.11317180592447942
507
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054089458597439195
508
+ web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.2771416755952798
509
+ web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.08905956074973984
510
+ web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.14969245864050304
511
+ web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.11088772044085522
512
+ web_nlg_en,5,median,rouge2_fmeasure,0.11088772044085522
513
+ web_nlg_en,5,average,multiple,0.09071696843284373
514
+ wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.0401546744710191
515
+ wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.011775575519769068
516
+ wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.017208857113077958
517
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03393757157227001
518
+ wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.018163595558952392
519
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.018163595558952392
520
+ wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.04582510801169016
521
+ wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.028480082558715494
522
+ wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.03420510386300404
523
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05423182118294372
524
+ wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.015006978500477466
525
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.03420510386300404
526
+ wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.05097602932656367
527
+ wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.04342414284752008
528
+ wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.04558123756721243
529
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05619324678157442
530
+ wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.015011655554296155
531
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.04558123756721243
532
+ wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.042273822347749235
533
+ wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03703687600072442
534
+ wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.04051754398323109
535
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04775424867054453
536
+ wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.015436277594316166
537
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04051754398323109
538
+ wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.013928620381908459
539
+ wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.011604393750423206
540
+ wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.01222026826951555
541
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014607128479951145
542
+ wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.003915565527437388
543
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.01222026826951555
544
+ wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.0020632447863588753
545
+ wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0018773141620031116
546
+ wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0014447433493777688
547
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0027143726441978717
548
+ wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.00024341445452042402
549
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0018773141620031116
550
+ wiki_lingua_en,5,average,multiple,0.025427510567319768
551
+ winogrande,0,Replace,acc,0.5090765588003157
552
+ winogrande,0,True or False,acc,0.4956590370955012
553
+ winogrande,0,does underscore refer to,acc,0.5082872928176796
554
+ winogrande,0,stand for,acc,0.5082872928176796
555
+ winogrande,0,underscore refer to,acc,0.4956590370955012
556
+ winogrande,0,median,accuracy,0.5082872928176796
557
+ winogrande,1,Replace,acc,0.4964483030781373
558
+ winogrande,1,True or False,acc,0.5082872928176796
559
+ winogrande,1,does underscore refer to,acc,0.5074980268350434
560
+ winogrande,1,stand for,acc,0.4996053670086819
561
+ winogrande,1,underscore refer to,acc,0.4980268350434096
562
+ winogrande,1,median,accuracy,0.4996053670086819
563
+ winogrande,2,Replace,acc,0.4846093133385951
564
+ winogrande,2,True or False,acc,0.489344909234412
565
+ winogrande,2,does underscore refer to,acc,0.5122336227308603
566
+ winogrande,2,stand for,acc,0.5043409629044988
567
+ winogrande,2,underscore refer to,acc,0.4988161010260458
568
+ winogrande,2,median,accuracy,0.4988161010260458
569
+ winogrande,3,Replace,acc,0.5019731649565904
570
+ winogrande,3,True or False,acc,0.49013417521704816
571
+ winogrande,3,does underscore refer to,acc,0.5240726124704025
572
+ winogrande,3,stand for,acc,0.4940805051302289
573
+ winogrande,3,underscore refer to,acc,0.5153906866614049
574
+ winogrande,3,median,accuracy,0.5019731649565904
575
+ winogrande,4,Replace,acc,0.4996053670086819
576
+ winogrande,4,True or False,acc,0.5035516969218626
577
+ winogrande,4,does underscore refer to,acc,0.5169692186266772
578
+ winogrande,4,stand for,acc,0.505130228887135
579
+ winogrande,4,underscore refer to,acc,0.5256511444356748
580
+ winogrande,4,median,accuracy,0.505130228887135
581
+ winogrande,5,Replace,acc,0.5035516969218626
582
+ winogrande,5,True or False,acc,0.505130228887135
583
+ winogrande,5,does underscore refer to,acc,0.5169692186266772
584
+ winogrande,5,stand for,acc,0.5327545382794001
585
+ winogrande,5,underscore refer to,acc,0.5256511444356748
586
+ winogrande,5,median,accuracy,0.5169692186266772
587
+ winogrande,5,average,multiple,0.505130228887135
4b284b21bc4/eval/merged.json ADDED
The diff for this file is too large to render. See raw diff
 
4b284b28bc4/eval/merged.csv ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ anli_r1,0,GPT-3 style,acc,0.329
3
+ anli_r1,0,MNLI crowdsource,acc,0.334
4
+ anli_r1,0,can we infer,acc,0.334
5
+ anli_r1,0,guaranteed/possible/impossible,acc,0.332
6
+ anli_r1,0,justified in saying,acc,0.344
7
+ anli_r1,0,median,accuracy,0.334
8
+ anli_r1,1,GPT-3 style,acc,0.355
9
+ anli_r1,1,MNLI crowdsource,acc,0.333
10
+ anli_r1,1,can we infer,acc,0.333
11
+ anli_r1,1,guaranteed/possible/impossible,acc,0.343
12
+ anli_r1,1,justified in saying,acc,0.332
13
+ anli_r1,1,median,accuracy,0.333
14
+ anli_r1,2,GPT-3 style,acc,0.36
15
+ anli_r1,2,MNLI crowdsource,acc,0.352
16
+ anli_r1,2,can we infer,acc,0.355
17
+ anli_r1,2,guaranteed/possible/impossible,acc,0.32
18
+ anli_r1,2,justified in saying,acc,0.351
19
+ anli_r1,2,median,accuracy,0.352
20
+ anli_r1,3,GPT-3 style,acc,0.363
21
+ anli_r1,3,MNLI crowdsource,acc,0.361
22
+ anli_r1,3,can we infer,acc,0.36
23
+ anli_r1,3,guaranteed/possible/impossible,acc,0.326
24
+ anli_r1,3,justified in saying,acc,0.347
25
+ anli_r1,3,median,accuracy,0.36
26
+ anli_r1,4,GPT-3 style,acc,0.349
27
+ anli_r1,4,MNLI crowdsource,acc,0.35
28
+ anli_r1,4,can we infer,acc,0.334
29
+ anli_r1,4,guaranteed/possible/impossible,acc,0.332
30
+ anli_r1,4,justified in saying,acc,0.331
31
+ anli_r1,4,median,accuracy,0.334
32
+ anli_r1,5,GPT-3 style,acc,0.364
33
+ anli_r1,5,MNLI crowdsource,acc,0.351
34
+ anli_r1,5,can we infer,acc,0.338
35
+ anli_r1,5,guaranteed/possible/impossible,acc,0.33
36
+ anli_r1,5,justified in saying,acc,0.33
37
+ anli_r1,5,median,accuracy,0.338
38
+ anli_r1,5,average,multiple,0.3418333333333333
39
+ anli_r2,0,GPT-3 style,acc,0.333
40
+ anli_r2,0,MNLI crowdsource,acc,0.334
41
+ anli_r2,0,can we infer,acc,0.329
42
+ anli_r2,0,guaranteed/possible/impossible,acc,0.333
43
+ anli_r2,0,justified in saying,acc,0.331
44
+ anli_r2,0,median,accuracy,0.333
45
+ anli_r2,1,GPT-3 style,acc,0.315
46
+ anli_r2,1,MNLI crowdsource,acc,0.315
47
+ anli_r2,1,can we infer,acc,0.315
48
+ anli_r2,1,guaranteed/possible/impossible,acc,0.311
49
+ anli_r2,1,justified in saying,acc,0.315
50
+ anli_r2,1,median,accuracy,0.315
51
+ anli_r2,2,GPT-3 style,acc,0.334
52
+ anli_r2,2,MNLI crowdsource,acc,0.316
53
+ anli_r2,2,can we infer,acc,0.324
54
+ anli_r2,2,guaranteed/possible/impossible,acc,0.326
55
+ anli_r2,2,justified in saying,acc,0.32
56
+ anli_r2,2,median,accuracy,0.324
57
+ anli_r2,3,GPT-3 style,acc,0.326
58
+ anli_r2,3,MNLI crowdsource,acc,0.317
59
+ anli_r2,3,can we infer,acc,0.324
60
+ anli_r2,3,guaranteed/possible/impossible,acc,0.341
61
+ anli_r2,3,justified in saying,acc,0.324
62
+ anli_r2,3,median,accuracy,0.324
63
+ anli_r2,4,GPT-3 style,acc,0.34
64
+ anli_r2,4,MNLI crowdsource,acc,0.32
65
+ anli_r2,4,can we infer,acc,0.314
66
+ anli_r2,4,guaranteed/possible/impossible,acc,0.332
67
+ anli_r2,4,justified in saying,acc,0.317
68
+ anli_r2,4,median,accuracy,0.32
69
+ anli_r2,5,GPT-3 style,acc,0.317
70
+ anli_r2,5,MNLI crowdsource,acc,0.312
71
+ anli_r2,5,can we infer,acc,0.321
72
+ anli_r2,5,guaranteed/possible/impossible,acc,0.339
73
+ anli_r2,5,justified in saying,acc,0.331
74
+ anli_r2,5,median,accuracy,0.321
75
+ anli_r2,5,average,multiple,0.32283333333333336
76
+ anli_r3,0,GPT-3 style,acc,0.3275
77
+ anli_r3,0,MNLI crowdsource,acc,0.3375
78
+ anli_r3,0,can we infer,acc,0.32666666666666666
79
+ anli_r3,0,guaranteed/possible/impossible,acc,0.3075
80
+ anli_r3,0,justified in saying,acc,0.3475
81
+ anli_r3,0,median,accuracy,0.3275
82
+ anli_r3,1,GPT-3 style,acc,0.335
83
+ anli_r3,1,MNLI crowdsource,acc,0.335
84
+ anli_r3,1,can we infer,acc,0.33666666666666667
85
+ anli_r3,1,guaranteed/possible/impossible,acc,0.3375
86
+ anli_r3,1,justified in saying,acc,0.3358333333333333
87
+ anli_r3,1,median,accuracy,0.3358333333333333
88
+ anli_r3,2,GPT-3 style,acc,0.32166666666666666
89
+ anli_r3,2,MNLI crowdsource,acc,0.32916666666666666
90
+ anli_r3,2,can we infer,acc,0.31333333333333335
91
+ anli_r3,2,guaranteed/possible/impossible,acc,0.32083333333333336
92
+ anli_r3,2,justified in saying,acc,0.32
93
+ anli_r3,2,median,accuracy,0.32083333333333336
94
+ anli_r3,3,GPT-3 style,acc,0.33166666666666667
95
+ anli_r3,3,MNLI crowdsource,acc,0.3425
96
+ anli_r3,3,can we infer,acc,0.3433333333333333
97
+ anli_r3,3,guaranteed/possible/impossible,acc,0.3275
98
+ anli_r3,3,justified in saying,acc,0.3525
99
+ anli_r3,3,median,accuracy,0.3425
100
+ anli_r3,4,GPT-3 style,acc,0.32166666666666666
101
+ anli_r3,4,MNLI crowdsource,acc,0.335
102
+ anli_r3,4,can we infer,acc,0.3225
103
+ anli_r3,4,guaranteed/possible/impossible,acc,0.3408333333333333
104
+ anli_r3,4,justified in saying,acc,0.31916666666666665
105
+ anli_r3,4,median,accuracy,0.3225
106
+ anli_r3,5,GPT-3 style,acc,0.315
107
+ anli_r3,5,MNLI crowdsource,acc,0.31833333333333336
108
+ anli_r3,5,can we infer,acc,0.31166666666666665
109
+ anli_r3,5,guaranteed/possible/impossible,acc,0.3375
110
+ anli_r3,5,justified in saying,acc,0.315
111
+ anli_r3,5,median,accuracy,0.315
112
+ anli_r3,5,average,multiple,0.3273611111111111
113
+ arc_easy,0,heres_a_problem,acc,0.255050505050505
114
+ arc_easy,0,i_am_hesitating,acc,0.35185185185185186
115
+ arc_easy,0,multiple_choice,acc,0.2354948805460751
116
+ arc_easy,0,pick_the_most_correct_option,acc,0.2563131313131313
117
+ arc_easy,0,qa_options,acc,0.35395622895622897
118
+ arc_easy,0,median,accuracy,0.2563131313131313
119
+ arc_easy,1,heres_a_problem,acc,0.23208191126279865
120
+ arc_easy,1,i_am_hesitating,acc,0.2713310580204778
121
+ arc_easy,1,multiple_choice,acc,0.25
122
+ arc_easy,1,pick_the_most_correct_option,acc,0.24284511784511784
123
+ arc_easy,1,qa_options,acc,0.3291245791245791
124
+ arc_easy,1,median,accuracy,0.25
125
+ arc_easy,2,heres_a_problem,acc,0.2558922558922559
126
+ arc_easy,2,i_am_hesitating,acc,0.3333333333333333
127
+ arc_easy,2,multiple_choice,acc,0.3282828282828283
128
+ arc_easy,2,pick_the_most_correct_option,acc,0.2563131313131313
129
+ arc_easy,2,qa_options,acc,0.32154882154882153
130
+ arc_easy,2,median,accuracy,0.32154882154882153
131
+ arc_easy,3,heres_a_problem,acc,0.22866894197952217
132
+ arc_easy,3,i_am_hesitating,acc,0.335016835016835
133
+ arc_easy,3,multiple_choice,acc,0.26023890784982934
134
+ arc_easy,3,pick_the_most_correct_option,acc,0.24621212121212122
135
+ arc_easy,3,qa_options,acc,0.2738907849829352
136
+ arc_easy,3,median,accuracy,0.26023890784982934
137
+ arc_easy,4,heres_a_problem,acc,0.24061433447098976
138
+ arc_easy,4,i_am_hesitating,acc,0.32407407407407407
139
+ arc_easy,4,multiple_choice,acc,0.26535836177474403
140
+ arc_easy,4,pick_the_most_correct_option,acc,0.24705387205387205
141
+ arc_easy,4,qa_options,acc,0.26023890784982934
142
+ arc_easy,4,median,accuracy,0.26023890784982934
143
+ arc_easy,5,heres_a_problem,acc,0.24663299663299662
144
+ arc_easy,5,i_am_hesitating,acc,0.3202861952861953
145
+ arc_easy,5,multiple_choice,acc,0.257679180887372
146
+ arc_easy,5,pick_the_most_correct_option,acc,0.25252525252525254
147
+ arc_easy,5,qa_options,acc,0.3164983164983165
148
+ arc_easy,5,median,accuracy,0.257679180887372
149
+ arc_easy,5,average,multiple,0.26766982490816393
150
+ boolq,0,GPT-3 Style,acc,0.589
151
+ boolq,0,after_reading,acc,0.6206666666666667
152
+ boolq,0,exercise,acc,0.6226666666666667
153
+ boolq,0,valid_binary,acc,0.49766666666666665
154
+ boolq,0,yes_no_question,acc,0.38966666666666666
155
+ boolq,0,median,accuracy,0.589
156
+ boolq,1,GPT-3 Style,acc,0.6156666666666667
157
+ boolq,1,after_reading,acc,0.5406666666666666
158
+ boolq,1,exercise,acc,0.5423333333333333
159
+ boolq,1,valid_binary,acc,0.5426666666666666
160
+ boolq,1,yes_no_question,acc,0.5406666666666666
161
+ boolq,1,median,accuracy,0.5423333333333333
162
+ boolq,2,GPT-3 Style,acc,0.6273333333333333
163
+ boolq,2,after_reading,acc,0.5963333333333334
164
+ boolq,2,exercise,acc,0.5473333333333333
165
+ boolq,2,valid_binary,acc,0.5913333333333334
166
+ boolq,2,yes_no_question,acc,0.595
167
+ boolq,2,median,accuracy,0.595
168
+ boolq,3,GPT-3 Style,acc,0.6313333333333333
169
+ boolq,3,after_reading,acc,0.613
170
+ boolq,3,exercise,acc,0.546
171
+ boolq,3,valid_binary,acc,0.6136666666666667
172
+ boolq,3,yes_no_question,acc,0.6096666666666667
173
+ boolq,3,median,accuracy,0.613
174
+ boolq,4,GPT-3 Style,acc,0.6323333333333333
175
+ boolq,4,after_reading,acc,0.6173333333333333
176
+ boolq,4,exercise,acc,0.5476666666666666
177
+ boolq,4,valid_binary,acc,0.6156666666666667
178
+ boolq,4,yes_no_question,acc,0.6206666666666667
179
+ boolq,4,median,accuracy,0.6173333333333333
180
+ boolq,5,GPT-3 Style,acc,0.6276666666666667
181
+ boolq,5,after_reading,acc,0.62
182
+ boolq,5,exercise,acc,0.5383333333333333
183
+ boolq,5,valid_binary,acc,0.6183333333333333
184
+ boolq,5,yes_no_question,acc,0.616
185
+ boolq,5,median,accuracy,0.6183333333333333
186
+ boolq,5,average,multiple,0.5958333333333333
187
+ cb,0,GPT-3 style,acc,0.39285714285714285
188
+ cb,0,MNLI crowdsource,acc,0.39285714285714285
189
+ cb,0,can we infer,acc,0.39285714285714285
190
+ cb,0,guaranteed/possible/impossible,acc,0.30357142857142855
191
+ cb,0,justified in saying,acc,0.3392857142857143
192
+ cb,0,median,accuracy,0.39285714285714285
193
+ cb,1,GPT-3 style,acc,0.39285714285714285
194
+ cb,1,MNLI crowdsource,acc,0.39285714285714285
195
+ cb,1,can we infer,acc,0.39285714285714285
196
+ cb,1,guaranteed/possible/impossible,acc,0.35714285714285715
197
+ cb,1,justified in saying,acc,0.39285714285714285
198
+ cb,1,median,accuracy,0.39285714285714285
199
+ cb,2,GPT-3 style,acc,0.44642857142857145
200
+ cb,2,MNLI crowdsource,acc,0.44642857142857145
201
+ cb,2,can we infer,acc,0.44642857142857145
202
+ cb,2,guaranteed/possible/impossible,acc,0.3392857142857143
203
+ cb,2,justified in saying,acc,0.44642857142857145
204
+ cb,2,median,accuracy,0.44642857142857145
205
+ cb,3,GPT-3 style,acc,0.44642857142857145
206
+ cb,3,MNLI crowdsource,acc,0.3392857142857143
207
+ cb,3,can we infer,acc,0.44642857142857145
208
+ cb,3,guaranteed/possible/impossible,acc,0.26785714285714285
209
+ cb,3,justified in saying,acc,0.39285714285714285
210
+ cb,3,median,accuracy,0.39285714285714285
211
+ cb,4,GPT-3 style,acc,0.48214285714285715
212
+ cb,4,MNLI crowdsource,acc,0.35714285714285715
213
+ cb,4,can we infer,acc,0.44642857142857145
214
+ cb,4,guaranteed/possible/impossible,acc,0.21428571428571427
215
+ cb,4,justified in saying,acc,0.4107142857142857
216
+ cb,4,median,accuracy,0.4107142857142857
217
+ cb,5,GPT-3 style,acc,0.44642857142857145
218
+ cb,5,MNLI crowdsource,acc,0.375
219
+ cb,5,can we infer,acc,0.4107142857142857
220
+ cb,5,guaranteed/possible/impossible,acc,0.21428571428571427
221
+ cb,5,justified in saying,acc,0.44642857142857145
222
+ cb,5,median,accuracy,0.4107142857142857
223
+ cb,5,average,multiple,0.40773809523809523
224
+ copa,0,best_option,acc,0.6
225
+ copa,0,cause_effect,acc,0.6
226
+ copa,0,choose,acc,0.6
227
+ copa,0,i_am_hesitating,acc,0.56
228
+ copa,0,plausible_alternatives,acc,0.57
229
+ copa,0,median,accuracy,0.6
230
+ copa,1,best_option,acc,0.5
231
+ copa,1,cause_effect,acc,0.46
232
+ copa,1,choose,acc,0.48
233
+ copa,1,i_am_hesitating,acc,0.47
234
+ copa,1,plausible_alternatives,acc,0.46
235
+ copa,1,median,accuracy,0.47
236
+ copa,2,best_option,acc,0.48
237
+ copa,2,cause_effect,acc,0.43
238
+ copa,2,choose,acc,0.47
239
+ copa,2,i_am_hesitating,acc,0.42
240
+ copa,2,plausible_alternatives,acc,0.44
241
+ copa,2,median,accuracy,0.44
242
+ copa,3,best_option,acc,0.52
243
+ copa,3,cause_effect,acc,0.45
244
+ copa,3,choose,acc,0.44
245
+ copa,3,i_am_hesitating,acc,0.48
246
+ copa,3,plausible_alternatives,acc,0.43
247
+ copa,3,median,accuracy,0.45
248
+ copa,4,best_option,acc,0.53
249
+ copa,4,cause_effect,acc,0.47
250
+ copa,4,choose,acc,0.43
251
+ copa,4,i_am_hesitating,acc,0.45
252
+ copa,4,plausible_alternatives,acc,0.46
253
+ copa,4,median,accuracy,0.46
254
+ copa,5,best_option,acc,0.5
255
+ copa,5,cause_effect,acc,0.47
256
+ copa,5,choose,acc,0.5
257
+ copa,5,i_am_hesitating,acc,0.49
258
+ copa,5,plausible_alternatives,acc,0.46
259
+ copa,5,median,accuracy,0.49
260
+ copa,5,average,multiple,0.485
261
+ e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.07227415925772734
262
+ e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.01749967959480122
263
+ e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,5.466015466015465e-05
264
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.010022915068112901
265
+ e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.053555609070310047
266
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.01749967959480122
267
+ e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1874037992560188
268
+ e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16624694535550544
269
+ e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.034204603336784774
270
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.20489136085595536
271
+ e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.20027753375836946
272
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1874037992560188
273
+ e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.18976901148520062
274
+ e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.1773205809463223
275
+ e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.07093954631020417
276
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2325284196471626
277
+ e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.1872354549740021
278
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1872354549740021
279
+ e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.19128265567370034
280
+ e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.1814061261607152
281
+ e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.09632029448061452
282
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24388713793667496
283
+ e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.18544033119790024
284
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18544033119790024
285
+ e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.18797801160921448
286
+ e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.1839557774098788
287
+ e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.11048253025142175
288
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24852828649672148
289
+ e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.18624803563290596
290
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18624803563290596
291
+ e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.18412349501175196
292
+ e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.1840109955212137
293
+ e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.11427201259332177
294
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24634621400768708
295
+ e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.18362949663211528
296
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1840109955212137
297
+ e2e_nlg_cleaned,5,average,multiple,0.157973049362807
298
+ gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.021087066079578522
299
+ gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.05712816335799515
300
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0511787638415587
301
+ gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.04876836268401361
302
+ gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.058199865858872574
303
+ gem_xsum,0,median,rouge2_fmeasure,0.0511787638415587
304
+ gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.01583906203117334
305
+ gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.052391077282527426
306
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04515071736102295
307
+ gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.044490758416455556
308
+ gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.04062921012242493
309
+ gem_xsum,1,median,rouge2_fmeasure,0.044490758416455556
310
+ gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.025696677971347164
311
+ gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05747020905233346
312
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.047730927310845786
313
+ gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.044164071082009024
314
+ gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.04062641993018198
315
+ gem_xsum,2,median,rouge2_fmeasure,0.044164071082009024
316
+ gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03211728661220163
317
+ gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.05413486443607653
318
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04656621187743751
319
+ gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.045340440062370646
320
+ gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.040076565671093634
321
+ gem_xsum,3,median,rouge2_fmeasure,0.045340440062370646
322
+ gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.009527033234766386
323
+ gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.013394347210809258
324
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010817994039374855
325
+ gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.012207544410113281
326
+ gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.00916357714654539
327
+ gem_xsum,4,median,rouge2_fmeasure,0.010817994039374855
328
+ gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
329
+ gem_xsum,5,DOC_tldr,rouge2_fmeasure,0.0002940707111925786
330
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001299594149643802
331
+ gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.0005146148795680421
332
+ gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.0
333
+ gem_xsum,5,median,rouge2_fmeasure,0.0001299594149643802
334
+ gem_xsum,5,average,multiple,0.032686997809455526
335
+ piqa,0,Correct the solution,rouge2_fmeasure,0.24351260224849494
336
+ piqa,0,choose the most appropriate solution,acc,0.49510337323177367
337
+ piqa,0,no prompt needed,rouge2_fmeasure,0.005621669068984276
338
+ piqa,0,pick_correct_choice_index,acc,0.49510337323177367
339
+ piqa,0,what_is_the_correct_ending,acc,0.5669205658324266
340
+ piqa,0,median,accuracy,0.49510337323177367
341
+ piqa,1,Correct the solution,rouge2_fmeasure,0.2728457121204912
342
+ piqa,1,choose the most appropriate solution,acc,0.5021762785636561
343
+ piqa,1,no prompt needed,rouge2_fmeasure,0.005440119305280117
344
+ piqa,1,pick_correct_choice_index,acc,0.500544069640914
345
+ piqa,1,what_is_the_correct_ending,acc,0.5495103373231773
346
+ piqa,1,median,accuracy,0.5021762785636561
347
+ piqa,2,Correct the solution,rouge2_fmeasure,0.4813163134151628
348
+ piqa,2,choose the most appropriate solution,acc,0.4929270946681175
349
+ piqa,2,no prompt needed,rouge2_fmeasure,0.004831334688117916
350
+ piqa,2,pick_correct_choice_index,acc,0.48748639825897716
351
+ piqa,2,what_is_the_correct_ending,acc,0.529923830250272
352
+ piqa,2,median,accuracy,0.4929270946681175
353
+ piqa,3,Correct the solution,rouge2_fmeasure,0.5130291514276495
354
+ piqa,3,choose the most appropriate solution,acc,0.5065288356909684
355
+ piqa,3,no prompt needed,rouge2_fmeasure,0.004497352970943405
356
+ piqa,3,pick_correct_choice_index,acc,0.4776931447225245
357
+ piqa,3,what_is_the_correct_ending,acc,0.529379760609358
358
+ piqa,3,median,accuracy,0.5065288356909684
359
+ piqa,4,Correct the solution,rouge2_fmeasure,0.5191957202312659
360
+ piqa,4,choose the most appropriate solution,acc,0.5059847660500544
361
+ piqa,4,no prompt needed,rouge2_fmeasure,0.00396783046089762
362
+ piqa,4,pick_correct_choice_index,acc,0.5021762785636561
363
+ piqa,4,what_is_the_correct_ending,acc,0.5277475516866159
364
+ piqa,4,median,accuracy,0.5059847660500544
365
+ piqa,5,Correct the solution,rouge2_fmeasure,0.5349097989485111
366
+ piqa,5,choose the most appropriate solution,acc,0.5032644178454843
367
+ piqa,5,no prompt needed,rouge2_fmeasure,0.0042412326403525056
368
+ piqa,5,pick_correct_choice_index,acc,0.4967355821545158
369
+ piqa,5,what_is_the_correct_ending,acc,0.5348204570184983
370
+ piqa,5,median,accuracy,0.5032644178454843
371
+ piqa,5,average,multiple,0.5009974610083424
372
+ sciq,0,Direct Question,acc,0.866
373
+ sciq,0,Direct Question (Closed Book),acc,0.617
374
+ sciq,0,Multiple Choice,acc,0.583
375
+ sciq,0,Multiple Choice (Closed Book),acc,0.46
376
+ sciq,0,Multiple Choice Question First,acc,0.534
377
+ sciq,0,median,accuracy,0.583
378
+ sciq,1,Direct Question,acc,0.9
379
+ sciq,1,Direct Question (Closed Book),acc,0.675
380
+ sciq,1,Multiple Choice,acc,0.507
381
+ sciq,1,Multiple Choice (Closed Book),acc,0.457
382
+ sciq,1,Multiple Choice Question First,acc,0.387
383
+ sciq,1,median,accuracy,0.507
384
+ sciq,2,Direct Question,acc,0.901
385
+ sciq,2,Direct Question (Closed Book),acc,0.689
386
+ sciq,2,Multiple Choice,acc,0.548
387
+ sciq,2,Multiple Choice (Closed Book),acc,0.543
388
+ sciq,2,Multiple Choice Question First,acc,0.42
389
+ sciq,2,median,accuracy,0.548
390
+ sciq,3,Direct Question,acc,0.911
391
+ sciq,3,Direct Question (Closed Book),acc,0.696
392
+ sciq,3,Multiple Choice,acc,0.575
393
+ sciq,3,Multiple Choice (Closed Book),acc,0.57
394
+ sciq,3,Multiple Choice Question First,acc,0.42
395
+ sciq,3,median,accuracy,0.575
396
+ sciq,4,Direct Question,acc,0.904
397
+ sciq,4,Direct Question (Closed Book),acc,0.709
398
+ sciq,4,Multiple Choice,acc,0.584
399
+ sciq,4,Multiple Choice (Closed Book),acc,0.565
400
+ sciq,4,Multiple Choice Question First,acc,0.445
401
+ sciq,4,median,accuracy,0.584
402
+ sciq,5,Direct Question,acc,0.906
403
+ sciq,5,Direct Question (Closed Book),acc,0.714
404
+ sciq,5,Multiple Choice,acc,0.581
405
+ sciq,5,Multiple Choice (Closed Book),acc,0.579
406
+ sciq,5,Multiple Choice Question First,acc,0.462
407
+ sciq,5,median,accuracy,0.581
408
+ sciq,5,average,multiple,0.563
409
+ story_cloze_2016,0,Answer Given options,acc,0.49706039551042225
410
+ story_cloze_2016,0,Choose Story Ending,acc,0.48957776590058794
411
+ story_cloze_2016,0,Novel Correct Ending,acc,0.4879743452699091
412
+ story_cloze_2016,0,Story Continuation and Options,acc,0.49438802779262425
413
+ story_cloze_2016,0,median,accuracy,0.4919828968466061
414
+ story_cloze_2016,1,Answer Given options,acc,0.4853019775521112
415
+ story_cloze_2016,1,Choose Story Ending,acc,0.4906467129877071
416
+ story_cloze_2016,1,Novel Correct Ending,acc,0.48102618920363444
417
+ story_cloze_2016,1,Story Continuation and Options,acc,0.4917156600748263
418
+ story_cloze_2016,1,median,accuracy,0.4879743452699091
419
+ story_cloze_2016,2,Answer Given options,acc,0.47888829502939606
420
+ story_cloze_2016,2,Choose Story Ending,acc,0.47728487439871725
421
+ story_cloze_2016,2,Novel Correct Ending,acc,0.4751469802244789
422
+ story_cloze_2016,2,Story Continuation and Options,acc,0.47995724211651525
423
+ story_cloze_2016,2,median,accuracy,0.47808658471405663
424
+ story_cloze_2016,3,Answer Given options,acc,0.4735435595938001
425
+ story_cloze_2016,3,Choose Story Ending,acc,0.4820951362907536
426
+ story_cloze_2016,3,Novel Correct Ending,acc,0.4740780331373597
427
+ story_cloze_2016,3,Story Continuation and Options,acc,0.4901122394441475
428
+ story_cloze_2016,3,median,accuracy,0.47808658471405663
429
+ story_cloze_2016,4,Answer Given options,acc,0.46178514163548906
430
+ story_cloze_2016,4,Choose Story Ending,acc,0.4730090860502405
431
+ story_cloze_2016,4,Novel Correct Ending,acc,0.4681988241582042
432
+ story_cloze_2016,4,Story Continuation and Options,acc,0.4879743452699091
433
+ story_cloze_2016,4,median,accuracy,0.47060395510422237
434
+ story_cloze_2016,5,Answer Given options,acc,0.46178514163548906
435
+ story_cloze_2016,5,Choose Story Ending,acc,0.4826296098343132
436
+ story_cloze_2016,5,Novel Correct Ending,acc,0.4719401389631213
437
+ story_cloze_2016,5,Story Continuation and Options,acc,0.49438802779262425
438
+ story_cloze_2016,5,median,accuracy,0.47728487439871725
439
+ story_cloze_2016,5,average,multiple,0.480669873507928
440
+ superglue_rte,0,GPT-3 style,acc,0.5090252707581228
441
+ superglue_rte,0,MNLI crowdsource,acc,0.48014440433212996
442
+ superglue_rte,0,does it follow that,acc,0.44404332129963897
443
+ superglue_rte,0,guaranteed true,acc,0.5126353790613718
444
+ superglue_rte,0,should assume,acc,0.5415162454873647
445
+ superglue_rte,0,median,accuracy,0.5090252707581228
446
+ superglue_rte,1,GPT-3 style,acc,0.5090252707581228
447
+ superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
448
+ superglue_rte,1,does it follow that,acc,0.49097472924187724
449
+ superglue_rte,1,guaranteed true,acc,0.49097472924187724
450
+ superglue_rte,1,should assume,acc,0.48375451263537905
451
+ superglue_rte,1,median,accuracy,0.49097472924187724
452
+ superglue_rte,2,GPT-3 style,acc,0.516245487364621
453
+ superglue_rte,2,MNLI crowdsource,acc,0.5054151624548736
454
+ superglue_rte,2,does it follow that,acc,0.516245487364621
455
+ superglue_rte,2,guaranteed true,acc,0.5018050541516246
456
+ superglue_rte,2,should assume,acc,0.516245487364621
457
+ superglue_rte,2,median,accuracy,0.516245487364621
458
+ superglue_rte,3,GPT-3 style,acc,0.5234657039711191
459
+ superglue_rte,3,MNLI crowdsource,acc,0.5270758122743683
460
+ superglue_rte,3,does it follow that,acc,0.5379061371841155
461
+ superglue_rte,3,guaranteed true,acc,0.51985559566787
462
+ superglue_rte,3,should assume,acc,0.5306859205776173
463
+ superglue_rte,3,median,accuracy,0.5270758122743683
464
+ superglue_rte,4,GPT-3 style,acc,0.5126353790613718
465
+ superglue_rte,4,MNLI crowdsource,acc,0.5379061371841155
466
+ superglue_rte,4,does it follow that,acc,0.51985559566787
467
+ superglue_rte,4,guaranteed true,acc,0.5342960288808665
468
+ superglue_rte,4,should assume,acc,0.5234657039711191
469
+ superglue_rte,4,median,accuracy,0.5234657039711191
470
+ superglue_rte,5,GPT-3 style,acc,0.5306859205776173
471
+ superglue_rte,5,MNLI crowdsource,acc,0.5090252707581228
472
+ superglue_rte,5,does it follow that,acc,0.5234657039711191
473
+ superglue_rte,5,guaranteed true,acc,0.5270758122743683
474
+ superglue_rte,5,should assume,acc,0.5306859205776173
475
+ superglue_rte,5,median,accuracy,0.5270758122743683
476
+ superglue_rte,5,average,multiple,0.5156438026474128
477
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.049917192299013896
478
+ web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.00537375341399136
479
+ web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.0034599979122394713
480
+ web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.0007679014767691473
481
+ web_nlg_en,0,very-explicit-description,rouge2_fmeasure,0.0005675109134810492
482
+ web_nlg_en,0,median,rouge2_fmeasure,0.0034599979122394713
483
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05553061893758205
484
+ web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.1493719195270224
485
+ web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.06073259012334097
486
+ web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.12187611759524282
487
+ web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.058019313464160664
488
+ web_nlg_en,1,median,rouge2_fmeasure,0.06073259012334097
489
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.057331612844470456
490
+ web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.29741471013327875
491
+ web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.10509886424101751
492
+ web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.13915045372935977
493
+ web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.18536983565498788
494
+ web_nlg_en,2,median,rouge2_fmeasure,0.13915045372935977
495
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05836966723015618
496
+ web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.31908027874923556
497
+ web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.12855310203356
498
+ web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.13862295274121508
499
+ web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.26767500932744387
500
+ web_nlg_en,3,median,rouge2_fmeasure,0.13862295274121508
501
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0577700863367864
502
+ web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.32559945531385337
503
+ web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.13327902883600057
504
+ web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.13292917476889163
505
+ web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.24713173271908043
506
+ web_nlg_en,4,median,rouge2_fmeasure,0.13327902883600057
507
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05951196634046783
508
+ web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.3330531711847648
509
+ web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.13440771646832117
510
+ web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.1292376475089326
511
+ web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.2256483945335245
512
+ web_nlg_en,5,median,rouge2_fmeasure,0.13440771646832117
513
+ web_nlg_en,5,average,multiple,0.10160878996841284
514
+ wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.04599914076874335
515
+ wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.014876069868397498
516
+ wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.01758898085911187
517
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03601951697280678
518
+ wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.015984403876231276
519
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.01758898085911187
520
+ wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.040864899057913275
521
+ wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.02017226312757468
522
+ wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.025193685294665726
523
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04757609861819433
524
+ wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.020818331143036432
525
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.025193685294665726
526
+ wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.04643912783550571
527
+ wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.040268213401871214
528
+ wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.040241799290576724
529
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05650249608530642
530
+ wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.0219486272286028
531
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.040268213401871214
532
+ wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.0414745893981831
533
+ wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03686577574547315
534
+ wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.03722078217429424
535
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05005886014366939
536
+ wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.017248853072901214
537
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.03722078217429424
538
+ wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.013892451555974773
539
+ wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.012358132206090394
540
+ wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.011579301131776956
541
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015594437236270214
542
+ wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.0040223276062826534
543
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.012358132206090394
544
+ wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.002181790528509318
545
+ wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0017789109729195612
546
+ wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0015940878813423497
547
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024833328621297794
548
+ wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.0003338217989499952
549
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0017789109729195612
550
+ wiki_lingua_en,5,average,multiple,0.022401450818158836
551
+ winogrande,0,Replace,acc,0.49013417521704816
552
+ winogrande,0,True or False,acc,0.4956590370955012
553
+ winogrande,0,does underscore refer to,acc,0.4940805051302289
554
+ winogrande,0,stand for,acc,0.500394632991318
555
+ winogrande,0,underscore refer to,acc,0.4861878453038674
556
+ winogrande,0,median,accuracy,0.4940805051302289
557
+ winogrande,1,Replace,acc,0.5098658247829518
558
+ winogrande,1,True or False,acc,0.494869771112865
559
+ winogrande,1,does underscore refer to,acc,0.505130228887135
560
+ winogrande,1,stand for,acc,0.5209155485398579
561
+ winogrande,1,underscore refer to,acc,0.5122336227308603
562
+ winogrande,1,median,accuracy,0.5098658247829518
563
+ winogrande,2,Replace,acc,0.5177584846093133
564
+ winogrande,2,True or False,acc,0.4956590370955012
565
+ winogrande,2,does underscore refer to,acc,0.5303867403314917
566
+ winogrande,2,stand for,acc,0.5240726124704025
567
+ winogrande,2,underscore refer to,acc,0.5146014206787688
568
+ winogrande,2,median,accuracy,0.5177584846093133
569
+ winogrande,3,Replace,acc,0.5240726124704025
570
+ winogrande,3,True or False,acc,0.4988161010260458
571
+ winogrande,3,does underscore refer to,acc,0.5272296764009471
572
+ winogrande,3,stand for,acc,0.510655090765588
573
+ winogrande,3,underscore refer to,acc,0.5248618784530387
574
+ winogrande,3,median,accuracy,0.5240726124704025
575
+ winogrande,4,Replace,acc,0.5177584846093133
576
+ winogrande,4,True or False,acc,0.5027624309392266
577
+ winogrande,4,does underscore refer to,acc,0.5288082083662194
578
+ winogrande,4,stand for,acc,0.5067087608524072
579
+ winogrande,4,underscore refer to,acc,0.5232833464877664
580
+ winogrande,4,median,accuracy,0.5177584846093133
581
+ winogrande,5,Replace,acc,0.5185477505919495
582
+ winogrande,5,True or False,acc,0.4940805051302289
583
+ winogrande,5,does underscore refer to,acc,0.526440410418311
584
+ winogrande,5,stand for,acc,0.4972375690607735
585
+ winogrande,5,underscore refer to,acc,0.5224940805051302
586
+ winogrande,5,median,accuracy,0.5185477505919495
587
+ winogrande,5,average,multiple,0.5136806103656932
4b284b28bc4/eval/merged.json ADDED
The diff for this file is too large to render. See raw diff
 
4b284b42bc4/eval/merged.csv ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ anli_r1,0,GPT-3 style,acc,0.323
3
+ anli_r1,0,MNLI crowdsource,acc,0.334
4
+ anli_r1,0,can we infer,acc,0.342
5
+ anli_r1,0,guaranteed/possible/impossible,acc,0.334
6
+ anli_r1,0,justified in saying,acc,0.342
7
+ anli_r1,0,median,accuracy,0.334
8
+ anli_r1,1,GPT-3 style,acc,0.324
9
+ anli_r1,1,MNLI crowdsource,acc,0.333
10
+ anli_r1,1,can we infer,acc,0.333
11
+ anli_r1,1,guaranteed/possible/impossible,acc,0.331
12
+ anli_r1,1,justified in saying,acc,0.333
13
+ anli_r1,1,median,accuracy,0.333
14
+ anli_r1,2,GPT-3 style,acc,0.346
15
+ anli_r1,2,MNLI crowdsource,acc,0.359
16
+ anli_r1,2,can we infer,acc,0.341
17
+ anli_r1,2,guaranteed/possible/impossible,acc,0.331
18
+ anli_r1,2,justified in saying,acc,0.337
19
+ anli_r1,2,median,accuracy,0.341
20
+ anli_r1,3,GPT-3 style,acc,0.348
21
+ anli_r1,3,MNLI crowdsource,acc,0.356
22
+ anli_r1,3,can we infer,acc,0.366
23
+ anli_r1,3,guaranteed/possible/impossible,acc,0.334
24
+ anli_r1,3,justified in saying,acc,0.356
25
+ anli_r1,3,median,accuracy,0.356
26
+ anli_r1,4,GPT-3 style,acc,0.325
27
+ anli_r1,4,MNLI crowdsource,acc,0.345
28
+ anli_r1,4,can we infer,acc,0.335
29
+ anli_r1,4,guaranteed/possible/impossible,acc,0.341
30
+ anli_r1,4,justified in saying,acc,0.343
31
+ anli_r1,4,median,accuracy,0.341
32
+ anli_r1,5,GPT-3 style,acc,0.314
33
+ anli_r1,5,MNLI crowdsource,acc,0.353
34
+ anli_r1,5,can we infer,acc,0.328
35
+ anli_r1,5,guaranteed/possible/impossible,acc,0.327
36
+ anli_r1,5,justified in saying,acc,0.326
37
+ anli_r1,5,median,accuracy,0.327
38
+ anli_r1,5,average,multiple,0.33866666666666667
39
+ anli_r2,0,GPT-3 style,acc,0.327
40
+ anli_r2,0,MNLI crowdsource,acc,0.334
41
+ anli_r2,0,can we infer,acc,0.348
42
+ anli_r2,0,guaranteed/possible/impossible,acc,0.336
43
+ anli_r2,0,justified in saying,acc,0.34
44
+ anli_r2,0,median,accuracy,0.336
45
+ anli_r2,1,GPT-3 style,acc,0.309
46
+ anli_r2,1,MNLI crowdsource,acc,0.315
47
+ anli_r2,1,can we infer,acc,0.315
48
+ anli_r2,1,guaranteed/possible/impossible,acc,0.308
49
+ anli_r2,1,justified in saying,acc,0.315
50
+ anli_r2,1,median,accuracy,0.315
51
+ anli_r2,2,GPT-3 style,acc,0.317
52
+ anli_r2,2,MNLI crowdsource,acc,0.312
53
+ anli_r2,2,can we infer,acc,0.316
54
+ anli_r2,2,guaranteed/possible/impossible,acc,0.324
55
+ anli_r2,2,justified in saying,acc,0.322
56
+ anli_r2,2,median,accuracy,0.317
57
+ anli_r2,3,GPT-3 style,acc,0.333
58
+ anli_r2,3,MNLI crowdsource,acc,0.305
59
+ anli_r2,3,can we infer,acc,0.32
60
+ anli_r2,3,guaranteed/possible/impossible,acc,0.33
61
+ anli_r2,3,justified in saying,acc,0.315
62
+ anli_r2,3,median,accuracy,0.32
63
+ anli_r2,4,GPT-3 style,acc,0.323
64
+ anli_r2,4,MNLI crowdsource,acc,0.306
65
+ anli_r2,4,can we infer,acc,0.308
66
+ anli_r2,4,guaranteed/possible/impossible,acc,0.311
67
+ anli_r2,4,justified in saying,acc,0.306
68
+ anli_r2,4,median,accuracy,0.308
69
+ anli_r2,5,GPT-3 style,acc,0.327
70
+ anli_r2,5,MNLI crowdsource,acc,0.315
71
+ anli_r2,5,can we infer,acc,0.326
72
+ anli_r2,5,guaranteed/possible/impossible,acc,0.319
73
+ anli_r2,5,justified in saying,acc,0.319
74
+ anli_r2,5,median,accuracy,0.319
75
+ anli_r2,5,average,multiple,0.31916666666666665
76
+ anli_r3,0,GPT-3 style,acc,0.35083333333333333
77
+ anli_r3,0,MNLI crowdsource,acc,0.33416666666666667
78
+ anli_r3,0,can we infer,acc,0.3325
79
+ anli_r3,0,guaranteed/possible/impossible,acc,0.3275
80
+ anli_r3,0,justified in saying,acc,0.33916666666666667
81
+ anli_r3,0,median,accuracy,0.33416666666666667
82
+ anli_r3,1,GPT-3 style,acc,0.3441666666666667
83
+ anli_r3,1,MNLI crowdsource,acc,0.33666666666666667
84
+ anli_r3,1,can we infer,acc,0.33666666666666667
85
+ anli_r3,1,guaranteed/possible/impossible,acc,0.3283333333333333
86
+ anli_r3,1,justified in saying,acc,0.33666666666666667
87
+ anli_r3,1,median,accuracy,0.33666666666666667
88
+ anli_r3,2,GPT-3 style,acc,0.3275
89
+ anli_r3,2,MNLI crowdsource,acc,0.31916666666666665
90
+ anli_r3,2,can we infer,acc,0.32
91
+ anli_r3,2,guaranteed/possible/impossible,acc,0.3125
92
+ anli_r3,2,justified in saying,acc,0.3275
93
+ anli_r3,2,median,accuracy,0.32
94
+ anli_r3,3,GPT-3 style,acc,0.33666666666666667
95
+ anli_r3,3,MNLI crowdsource,acc,0.32916666666666666
96
+ anli_r3,3,can we infer,acc,0.335
97
+ anli_r3,3,guaranteed/possible/impossible,acc,0.3258333333333333
98
+ anli_r3,3,justified in saying,acc,0.3383333333333333
99
+ anli_r3,3,median,accuracy,0.335
100
+ anli_r3,4,GPT-3 style,acc,0.30666666666666664
101
+ anli_r3,4,MNLI crowdsource,acc,0.3275
102
+ anli_r3,4,can we infer,acc,0.3233333333333333
103
+ anli_r3,4,guaranteed/possible/impossible,acc,0.31583333333333335
104
+ anli_r3,4,justified in saying,acc,0.32166666666666666
105
+ anli_r3,4,median,accuracy,0.32166666666666666
106
+ anli_r3,5,GPT-3 style,acc,0.31166666666666665
107
+ anli_r3,5,MNLI crowdsource,acc,0.30833333333333335
108
+ anli_r3,5,can we infer,acc,0.32666666666666666
109
+ anli_r3,5,guaranteed/possible/impossible,acc,0.31416666666666665
110
+ anli_r3,5,justified in saying,acc,0.3233333333333333
111
+ anli_r3,5,median,accuracy,0.31416666666666665
112
+ anli_r3,5,average,multiple,0.3269444444444444
113
+ arc_easy,0,heres_a_problem,acc,0.24494949494949494
114
+ arc_easy,0,i_am_hesitating,acc,0.25170648464163825
115
+ arc_easy,0,multiple_choice,acc,0.24488054607508533
116
+ arc_easy,0,pick_the_most_correct_option,acc,0.23947811447811448
117
+ arc_easy,0,qa_options,acc,0.2619453924914676
118
+ arc_easy,0,median,accuracy,0.24494949494949494
119
+ arc_easy,1,heres_a_problem,acc,0.2380546075085324
120
+ arc_easy,1,i_am_hesitating,acc,0.3560606060606061
121
+ arc_easy,1,multiple_choice,acc,0.24914675767918087
122
+ arc_easy,1,pick_the_most_correct_option,acc,0.23526936026936027
123
+ arc_easy,1,qa_options,acc,0.26791808873720135
124
+ arc_easy,1,median,accuracy,0.24914675767918087
125
+ arc_easy,2,heres_a_problem,acc,0.242003367003367
126
+ arc_easy,2,i_am_hesitating,acc,0.3480639730639731
127
+ arc_easy,2,multiple_choice,acc,0.35353535353535354
128
+ arc_easy,2,pick_the_most_correct_option,acc,0.2431740614334471
129
+ arc_easy,2,qa_options,acc,0.34553872053872053
130
+ arc_easy,2,median,accuracy,0.34553872053872053
131
+ arc_easy,3,heres_a_problem,acc,0.2478956228956229
132
+ arc_easy,3,i_am_hesitating,acc,0.26535836177474403
133
+ arc_easy,3,multiple_choice,acc,0.34553872053872053
134
+ arc_easy,3,pick_the_most_correct_option,acc,0.2474747474747475
135
+ arc_easy,3,qa_options,acc,0.257679180887372
136
+ arc_easy,3,median,accuracy,0.257679180887372
137
+ arc_easy,4,heres_a_problem,acc,0.25341296928327645
138
+ arc_easy,4,i_am_hesitating,acc,0.2593856655290102
139
+ arc_easy,4,multiple_choice,acc,0.2525597269624573
140
+ arc_easy,4,pick_the_most_correct_option,acc,0.2551194539249147
141
+ arc_easy,4,qa_options,acc,0.3341750841750842
142
+ arc_easy,4,median,accuracy,0.2551194539249147
143
+ arc_easy,5,heres_a_problem,acc,0.2354948805460751
144
+ arc_easy,5,i_am_hesitating,acc,0.32365319865319864
145
+ arc_easy,5,multiple_choice,acc,0.2508532423208191
146
+ arc_easy,5,pick_the_most_correct_option,acc,0.25252525252525254
147
+ arc_easy,5,qa_options,acc,0.3261784511784512
148
+ arc_easy,5,median,accuracy,0.25252525252525254
149
+ arc_easy,5,average,multiple,0.26749314341748925
150
+ boolq,0,GPT-3 Style,acc,0.538
151
+ boolq,0,after_reading,acc,0.6233333333333333
152
+ boolq,0,exercise,acc,0.623
153
+ boolq,0,valid_binary,acc,0.5896666666666667
154
+ boolq,0,yes_no_question,acc,0.5293333333333333
155
+ boolq,0,median,accuracy,0.5896666666666667
156
+ boolq,1,GPT-3 Style,acc,0.5356666666666666
157
+ boolq,1,after_reading,acc,0.5406666666666666
158
+ boolq,1,exercise,acc,0.5566666666666666
159
+ boolq,1,valid_binary,acc,0.5423333333333333
160
+ boolq,1,yes_no_question,acc,0.5406666666666666
161
+ boolq,1,median,accuracy,0.5406666666666666
162
+ boolq,2,GPT-3 Style,acc,0.5443333333333333
163
+ boolq,2,after_reading,acc,0.5396666666666666
164
+ boolq,2,exercise,acc,0.5536666666666666
165
+ boolq,2,valid_binary,acc,0.5706666666666667
166
+ boolq,2,yes_no_question,acc,0.48233333333333334
167
+ boolq,2,median,accuracy,0.5443333333333333
168
+ boolq,3,GPT-3 Style,acc,0.5566666666666666
169
+ boolq,3,after_reading,acc,0.539
170
+ boolq,3,exercise,acc,0.5583333333333333
171
+ boolq,3,valid_binary,acc,0.5633333333333334
172
+ boolq,3,yes_no_question,acc,0.4676666666666667
173
+ boolq,3,median,accuracy,0.5566666666666666
174
+ boolq,4,GPT-3 Style,acc,0.5656666666666667
175
+ boolq,4,after_reading,acc,0.527
176
+ boolq,4,exercise,acc,0.57
177
+ boolq,4,valid_binary,acc,0.5543333333333333
178
+ boolq,4,yes_no_question,acc,0.481
179
+ boolq,4,median,accuracy,0.5543333333333333
180
+ boolq,5,GPT-3 Style,acc,0.5716666666666667
181
+ boolq,5,after_reading,acc,0.5133333333333333
182
+ boolq,5,exercise,acc,0.567
183
+ boolq,5,valid_binary,acc,0.561
184
+ boolq,5,yes_no_question,acc,0.47733333333333333
185
+ boolq,5,median,accuracy,0.561
186
+ boolq,5,average,multiple,0.5577777777777778
187
+ cb,0,GPT-3 style,acc,0.35714285714285715
188
+ cb,0,MNLI crowdsource,acc,0.4107142857142857
189
+ cb,0,can we infer,acc,0.4642857142857143
190
+ cb,0,guaranteed/possible/impossible,acc,0.14285714285714285
191
+ cb,0,justified in saying,acc,0.35714285714285715
192
+ cb,0,median,accuracy,0.35714285714285715
193
+ cb,1,GPT-3 style,acc,0.39285714285714285
194
+ cb,1,MNLI crowdsource,acc,0.39285714285714285
195
+ cb,1,can we infer,acc,0.39285714285714285
196
+ cb,1,guaranteed/possible/impossible,acc,0.375
197
+ cb,1,justified in saying,acc,0.39285714285714285
198
+ cb,1,median,accuracy,0.39285714285714285
199
+ cb,2,GPT-3 style,acc,0.375
200
+ cb,2,MNLI crowdsource,acc,0.4642857142857143
201
+ cb,2,can we infer,acc,0.39285714285714285
202
+ cb,2,guaranteed/possible/impossible,acc,0.375
203
+ cb,2,justified in saying,acc,0.4107142857142857
204
+ cb,2,median,accuracy,0.39285714285714285
205
+ cb,3,GPT-3 style,acc,0.35714285714285715
206
+ cb,3,MNLI crowdsource,acc,0.5357142857142857
207
+ cb,3,can we infer,acc,0.44642857142857145
208
+ cb,3,guaranteed/possible/impossible,acc,0.35714285714285715
209
+ cb,3,justified in saying,acc,0.44642857142857145
210
+ cb,3,median,accuracy,0.44642857142857145
211
+ cb,4,GPT-3 style,acc,0.3392857142857143
212
+ cb,4,MNLI crowdsource,acc,0.4642857142857143
213
+ cb,4,can we infer,acc,0.5357142857142857
214
+ cb,4,guaranteed/possible/impossible,acc,0.42857142857142855
215
+ cb,4,justified in saying,acc,0.44642857142857145
216
+ cb,4,median,accuracy,0.44642857142857145
217
+ cb,5,GPT-3 style,acc,0.30357142857142855
218
+ cb,5,MNLI crowdsource,acc,0.5
219
+ cb,5,can we infer,acc,0.5
220
+ cb,5,guaranteed/possible/impossible,acc,0.35714285714285715
221
+ cb,5,justified in saying,acc,0.42857142857142855
222
+ cb,5,median,accuracy,0.42857142857142855
223
+ cb,5,average,multiple,0.4107142857142857
224
+ copa,0,best_option,acc,0.54
225
+ copa,0,cause_effect,acc,0.6
226
+ copa,0,choose,acc,0.61
227
+ copa,0,i_am_hesitating,acc,0.62
228
+ copa,0,plausible_alternatives,acc,0.61
229
+ copa,0,median,accuracy,0.61
230
+ copa,1,best_option,acc,0.57
231
+ copa,1,cause_effect,acc,0.47
232
+ copa,1,choose,acc,0.47
233
+ copa,1,i_am_hesitating,acc,0.49
234
+ copa,1,plausible_alternatives,acc,0.42
235
+ copa,1,median,accuracy,0.47
236
+ copa,2,best_option,acc,0.57
237
+ copa,2,cause_effect,acc,0.44
238
+ copa,2,choose,acc,0.48
239
+ copa,2,i_am_hesitating,acc,0.45
240
+ copa,2,plausible_alternatives,acc,0.43
241
+ copa,2,median,accuracy,0.45
242
+ copa,3,best_option,acc,0.57
243
+ copa,3,cause_effect,acc,0.48
244
+ copa,3,choose,acc,0.49
245
+ copa,3,i_am_hesitating,acc,0.44
246
+ copa,3,plausible_alternatives,acc,0.43
247
+ copa,3,median,accuracy,0.48
248
+ copa,4,best_option,acc,0.59
249
+ copa,4,cause_effect,acc,0.48
250
+ copa,4,choose,acc,0.47
251
+ copa,4,i_am_hesitating,acc,0.48
252
+ copa,4,plausible_alternatives,acc,0.45
253
+ copa,4,median,accuracy,0.48
254
+ copa,5,best_option,acc,0.55
255
+ copa,5,cause_effect,acc,0.45
256
+ copa,5,choose,acc,0.45
257
+ copa,5,i_am_hesitating,acc,0.44
258
+ copa,5,plausible_alternatives,acc,0.45
259
+ copa,5,median,accuracy,0.45
260
+ copa,5,average,multiple,0.49
261
+ e2e_nlg_cleaned,0,coherent_text,rouge2_fmeasure,0.0768086035487303
262
+ e2e_nlg_cleaned,0,create_text_for_me,rouge2_fmeasure,0.02963754911980315
263
+ e2e_nlg_cleaned,0,generate_gramatically_correct_text,rouge2_fmeasure,0.0
264
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0063826724183375155
265
+ e2e_nlg_cleaned,0,text,rouge2_fmeasure,0.16205539672246214
266
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.02963754911980315
267
+ e2e_nlg_cleaned,1,coherent_text,rouge2_fmeasure,0.1695476123074274
268
+ e2e_nlg_cleaned,1,create_text_for_me,rouge2_fmeasure,0.16266540713305758
269
+ e2e_nlg_cleaned,1,generate_gramatically_correct_text,rouge2_fmeasure,0.06654255399675307
270
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2056915755809246
271
+ e2e_nlg_cleaned,1,text,rouge2_fmeasure,0.20190477038644114
272
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1695476123074274
273
+ e2e_nlg_cleaned,2,coherent_text,rouge2_fmeasure,0.18139573177296225
274
+ e2e_nlg_cleaned,2,create_text_for_me,rouge2_fmeasure,0.174575698484694
275
+ e2e_nlg_cleaned,2,generate_gramatically_correct_text,rouge2_fmeasure,0.10730240032895362
276
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22591032128288588
277
+ e2e_nlg_cleaned,2,text,rouge2_fmeasure,0.20141804290762905
278
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18139573177296225
279
+ e2e_nlg_cleaned,3,coherent_text,rouge2_fmeasure,0.18499822465619842
280
+ e2e_nlg_cleaned,3,create_text_for_me,rouge2_fmeasure,0.17547831360286914
281
+ e2e_nlg_cleaned,3,generate_gramatically_correct_text,rouge2_fmeasure,0.12907057263921567
282
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23547797340215765
283
+ e2e_nlg_cleaned,3,text,rouge2_fmeasure,0.19894653788829594
284
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18499822465619842
285
+ e2e_nlg_cleaned,4,coherent_text,rouge2_fmeasure,0.18449634822804548
286
+ e2e_nlg_cleaned,4,create_text_for_me,rouge2_fmeasure,0.1739020471143616
287
+ e2e_nlg_cleaned,4,generate_gramatically_correct_text,rouge2_fmeasure,0.14413942196532828
288
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23765394178309218
289
+ e2e_nlg_cleaned,4,text,rouge2_fmeasure,0.19618559845779804
290
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18449634822804548
291
+ e2e_nlg_cleaned,5,coherent_text,rouge2_fmeasure,0.180205137590262
292
+ e2e_nlg_cleaned,5,create_text_for_me,rouge2_fmeasure,0.17145057785135936
293
+ e2e_nlg_cleaned,5,generate_gramatically_correct_text,rouge2_fmeasure,0.15257686050529207
294
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2366049201616526
295
+ e2e_nlg_cleaned,5,text,rouge2_fmeasure,0.19065995281551984
296
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.180205137590262
297
+ e2e_nlg_cleaned,5,average,multiple,0.15504676727911645
298
+ gem_xsum,0,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.021410321262745457
299
+ gem_xsum,0,DOC_tldr,rouge2_fmeasure,0.05123793702073981
300
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04790575968435739
301
+ gem_xsum,0,summarize_DOC,rouge2_fmeasure,0.041608603555378855
302
+ gem_xsum,0,summarize_this_DOC_summary,rouge2_fmeasure,0.04915332059409096
303
+ gem_xsum,0,median,rouge2_fmeasure,0.04790575968435739
304
+ gem_xsum,1,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0187210253001416
305
+ gem_xsum,1,DOC_tldr,rouge2_fmeasure,0.050711870640867504
306
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04413377405232099
307
+ gem_xsum,1,summarize_DOC,rouge2_fmeasure,0.04729852367510724
308
+ gem_xsum,1,summarize_this_DOC_summary,rouge2_fmeasure,0.038642466121148654
309
+ gem_xsum,1,median,rouge2_fmeasure,0.04413377405232099
310
+ gem_xsum,2,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03174503623834507
311
+ gem_xsum,2,DOC_tldr,rouge2_fmeasure,0.05192590462289213
312
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.046170973346933354
313
+ gem_xsum,2,summarize_DOC,rouge2_fmeasure,0.04973150539674991
314
+ gem_xsum,2,summarize_this_DOC_summary,rouge2_fmeasure,0.04211259064384283
315
+ gem_xsum,2,median,rouge2_fmeasure,0.046170973346933354
316
+ gem_xsum,3,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.03592964212545872
317
+ gem_xsum,3,DOC_tldr,rouge2_fmeasure,0.0504745368841509
318
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04715930396420784
319
+ gem_xsum,3,summarize_DOC,rouge2_fmeasure,0.04868798327580798
320
+ gem_xsum,3,summarize_this_DOC_summary,rouge2_fmeasure,0.0405702032936913
321
+ gem_xsum,3,median,rouge2_fmeasure,0.04715930396420784
322
+ gem_xsum,4,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.009837365340636202
323
+ gem_xsum,4,DOC_tldr,rouge2_fmeasure,0.013418856497160158
324
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012335864733508397
325
+ gem_xsum,4,summarize_DOC,rouge2_fmeasure,0.012561323055777309
326
+ gem_xsum,4,summarize_this_DOC_summary,rouge2_fmeasure,0.010936982876670076
327
+ gem_xsum,4,median,rouge2_fmeasure,0.012335864733508397
328
+ gem_xsum,5,DOC_boils_down_to_simple_idea_that,rouge2_fmeasure,0.0
329
+ gem_xsum,5,DOC_tldr,rouge2_fmeasure,9.617082045566528e-05
330
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0004337191943913522
331
+ gem_xsum,5,summarize_DOC,rouge2_fmeasure,0.00025865120204742847
332
+ gem_xsum,5,summarize_this_DOC_summary,rouge2_fmeasure,0.00011435105774728416
333
+ gem_xsum,5,median,rouge2_fmeasure,0.00011435105774728416
334
+ gem_xsum,5,average,multiple,0.03297000447317921
335
+ piqa,0,Correct the solution,rouge2_fmeasure,0.20120770859481335
336
+ piqa,0,choose the most appropriate solution,acc,0.48639825897714906
337
+ piqa,0,no prompt needed,rouge2_fmeasure,0.00581609356366873
338
+ piqa,0,pick_correct_choice_index,acc,0.49510337323177367
339
+ piqa,0,what_is_the_correct_ending,acc,0.5663764961915125
340
+ piqa,0,median,accuracy,0.49510337323177367
341
+ piqa,1,Correct the solution,rouge2_fmeasure,0.33159324549421026
342
+ piqa,1,choose the most appropriate solution,acc,0.5032644178454843
343
+ piqa,1,no prompt needed,rouge2_fmeasure,0.005210530351647924
344
+ piqa,1,pick_correct_choice_index,acc,0.49347116430903154
345
+ piqa,1,what_is_the_correct_ending,acc,0.573993471164309
346
+ piqa,1,median,accuracy,0.5032644178454843
347
+ piqa,2,Correct the solution,rouge2_fmeasure,0.38914197156300356
348
+ piqa,2,choose the most appropriate solution,acc,0.5103373231773667
349
+ piqa,2,no prompt needed,rouge2_fmeasure,0.004949579293108938
350
+ piqa,2,pick_correct_choice_index,acc,0.4885745375408052
351
+ piqa,2,what_is_the_correct_ending,acc,0.5696409140369967
352
+ piqa,2,median,accuracy,0.5103373231773667
353
+ piqa,3,Correct the solution,rouge2_fmeasure,0.39761438363429064
354
+ piqa,3,choose the most appropriate solution,acc,0.5048966267682263
355
+ piqa,3,no prompt needed,rouge2_fmeasure,0.004937686316660083
356
+ piqa,3,pick_correct_choice_index,acc,0.5065288356909684
357
+ piqa,3,what_is_the_correct_ending,acc,0.5554951033732318
358
+ piqa,3,median,accuracy,0.5065288356909684
359
+ piqa,4,Correct the solution,rouge2_fmeasure,0.36764860730658894
360
+ piqa,4,choose the most appropriate solution,acc,0.5032644178454843
361
+ piqa,4,no prompt needed,rouge2_fmeasure,0.004548001185708453
362
+ piqa,4,pick_correct_choice_index,acc,0.5081610446137106
363
+ piqa,4,what_is_the_correct_ending,acc,0.5544069640914037
364
+ piqa,4,median,accuracy,0.5081610446137106
365
+ piqa,5,Correct the solution,rouge2_fmeasure,0.3454708512028626
366
+ piqa,5,choose the most appropriate solution,acc,0.5038084874863983
367
+ piqa,5,no prompt needed,rouge2_fmeasure,0.004669858309181997
368
+ piqa,5,pick_correct_choice_index,acc,0.49020674646354734
369
+ piqa,5,what_is_the_correct_ending,acc,0.5554951033732318
370
+ piqa,5,median,accuracy,0.5038084874863983
371
+ piqa,5,average,multiple,0.5045339136742837
372
+ sciq,0,Direct Question,acc,0.867
373
+ sciq,0,Direct Question (Closed Book),acc,0.639
374
+ sciq,0,Multiple Choice,acc,0.601
375
+ sciq,0,Multiple Choice (Closed Book),acc,0.5
376
+ sciq,0,Multiple Choice Question First,acc,0.625
377
+ sciq,0,median,accuracy,0.625
378
+ sciq,1,Direct Question,acc,0.892
379
+ sciq,1,Direct Question (Closed Book),acc,0.679
380
+ sciq,1,Multiple Choice,acc,0.507
381
+ sciq,1,Multiple Choice (Closed Book),acc,0.506
382
+ sciq,1,Multiple Choice Question First,acc,0.42
383
+ sciq,1,median,accuracy,0.507
384
+ sciq,2,Direct Question,acc,0.9
385
+ sciq,2,Direct Question (Closed Book),acc,0.702
386
+ sciq,2,Multiple Choice,acc,0.559
387
+ sciq,2,Multiple Choice (Closed Book),acc,0.539
388
+ sciq,2,Multiple Choice Question First,acc,0.477
389
+ sciq,2,median,accuracy,0.559
390
+ sciq,3,Direct Question,acc,0.909
391
+ sciq,3,Direct Question (Closed Book),acc,0.717
392
+ sciq,3,Multiple Choice,acc,0.607
393
+ sciq,3,Multiple Choice (Closed Book),acc,0.57
394
+ sciq,3,Multiple Choice Question First,acc,0.546
395
+ sciq,3,median,accuracy,0.607
396
+ sciq,4,Direct Question,acc,0.912
397
+ sciq,4,Direct Question (Closed Book),acc,0.716
398
+ sciq,4,Multiple Choice,acc,0.642
399
+ sciq,4,Multiple Choice (Closed Book),acc,0.565
400
+ sciq,4,Multiple Choice Question First,acc,0.574
401
+ sciq,4,median,accuracy,0.642
402
+ sciq,5,Direct Question,acc,0.918
403
+ sciq,5,Direct Question (Closed Book),acc,0.716
404
+ sciq,5,Multiple Choice,acc,0.643
405
+ sciq,5,Multiple Choice (Closed Book),acc,0.577
406
+ sciq,5,Multiple Choice Question First,acc,0.622
407
+ sciq,5,median,accuracy,0.643
408
+ sciq,5,average,multiple,0.5971666666666666
409
+ story_cloze_2016,0,Answer Given options,acc,0.4730090860502405
410
+ story_cloze_2016,0,Choose Story Ending,acc,0.4820951362907536
411
+ story_cloze_2016,0,Novel Correct Ending,acc,0.4820951362907536
412
+ story_cloze_2016,0,Story Continuation and Options,acc,0.46125066809192944
413
+ story_cloze_2016,0,median,accuracy,0.47755211117049706
414
+ story_cloze_2016,1,Answer Given options,acc,0.47140566541956175
415
+ story_cloze_2016,1,Choose Story Ending,acc,0.48583645109567075
416
+ story_cloze_2016,1,Novel Correct Ending,acc,0.4820951362907536
417
+ story_cloze_2016,1,Story Continuation and Options,acc,0.48850881881346875
418
+ story_cloze_2016,1,median,accuracy,0.4839657936932122
419
+ story_cloze_2016,2,Answer Given options,acc,0.47728487439871725
420
+ story_cloze_2016,2,Choose Story Ending,acc,0.48583645109567075
421
+ story_cloze_2016,2,Novel Correct Ending,acc,0.4853019775521112
422
+ story_cloze_2016,2,Story Continuation and Options,acc,0.47728487439871725
423
+ story_cloze_2016,2,median,accuracy,0.48129342597541425
424
+ story_cloze_2016,3,Answer Given options,acc,0.47247461250668094
425
+ story_cloze_2016,3,Choose Story Ending,acc,0.4751469802244789
426
+ story_cloze_2016,3,Novel Correct Ending,acc,0.4794227685729556
427
+ story_cloze_2016,3,Story Continuation and Options,acc,0.4681988241582042
428
+ story_cloze_2016,3,median,accuracy,0.4738107963655799
429
+ story_cloze_2016,4,Answer Given options,acc,0.4730090860502405
430
+ story_cloze_2016,4,Choose Story Ending,acc,0.47247461250668094
431
+ story_cloze_2016,4,Novel Correct Ending,acc,0.4831640833778728
432
+ story_cloze_2016,4,Story Continuation and Options,acc,0.4692677712453234
433
+ story_cloze_2016,4,median,accuracy,0.47274184927846075
434
+ story_cloze_2016,5,Answer Given options,acc,0.47033671833244256
435
+ story_cloze_2016,5,Choose Story Ending,acc,0.4665954035275254
436
+ story_cloze_2016,5,Novel Correct Ending,acc,0.47888829502939606
437
+ story_cloze_2016,5,Story Continuation and Options,acc,0.4740780331373597
438
+ story_cloze_2016,5,median,accuracy,0.4722073757349011
439
+ story_cloze_2016,5,average,multiple,0.4769285587030109
440
+ superglue_rte,0,GPT-3 style,acc,0.4404332129963899
441
+ superglue_rte,0,MNLI crowdsource,acc,0.5523465703971119
442
+ superglue_rte,0,does it follow that,acc,0.5451263537906137
443
+ superglue_rte,0,guaranteed true,acc,0.48014440433212996
444
+ superglue_rte,0,should assume,acc,0.51985559566787
445
+ superglue_rte,0,median,accuracy,0.51985559566787
446
+ superglue_rte,1,GPT-3 style,acc,0.5018050541516246
447
+ superglue_rte,1,MNLI crowdsource,acc,0.49097472924187724
448
+ superglue_rte,1,does it follow that,acc,0.48736462093862815
449
+ superglue_rte,1,guaranteed true,acc,0.49097472924187724
450
+ superglue_rte,1,should assume,acc,0.49097472924187724
451
+ superglue_rte,1,median,accuracy,0.49097472924187724
452
+ superglue_rte,2,GPT-3 style,acc,0.5234657039711191
453
+ superglue_rte,2,MNLI crowdsource,acc,0.5054151624548736
454
+ superglue_rte,2,does it follow that,acc,0.51985559566787
455
+ superglue_rte,2,guaranteed true,acc,0.48375451263537905
456
+ superglue_rte,2,should assume,acc,0.4981949458483754
457
+ superglue_rte,2,median,accuracy,0.5054151624548736
458
+ superglue_rte,3,GPT-3 style,acc,0.555956678700361
459
+ superglue_rte,3,MNLI crowdsource,acc,0.5018050541516246
460
+ superglue_rte,3,does it follow that,acc,0.5306859205776173
461
+ superglue_rte,3,guaranteed true,acc,0.5270758122743683
462
+ superglue_rte,3,should assume,acc,0.516245487364621
463
+ superglue_rte,3,median,accuracy,0.5270758122743683
464
+ superglue_rte,4,GPT-3 style,acc,0.5631768953068592
465
+ superglue_rte,4,MNLI crowdsource,acc,0.47653429602888087
466
+ superglue_rte,4,does it follow that,acc,0.516245487364621
467
+ superglue_rte,4,guaranteed true,acc,0.49097472924187724
468
+ superglue_rte,4,should assume,acc,0.47653429602888087
469
+ superglue_rte,4,median,accuracy,0.49097472924187724
470
+ superglue_rte,5,GPT-3 style,acc,0.5631768953068592
471
+ superglue_rte,5,MNLI crowdsource,acc,0.4584837545126354
472
+ superglue_rte,5,does it follow that,acc,0.5415162454873647
473
+ superglue_rte,5,guaranteed true,acc,0.4729241877256318
474
+ superglue_rte,5,should assume,acc,0.4584837545126354
475
+ superglue_rte,5,median,accuracy,0.4729241877256318
476
+ superglue_rte,5,average,multiple,0.5012033694344163
477
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05344453588119793
478
+ web_nlg_en,0,explicit-graph-description2,rouge2_fmeasure,0.011184314741657642
479
+ web_nlg_en,0,implicit-graph-description,rouge2_fmeasure,0.008207247449935154
480
+ web_nlg_en,0,non-explicit-description,rouge2_fmeasure,0.020385877678818807
481
+ web_nlg_en,0,very-explicit-description,rouge2_fmeasure,3.843511432044316e-06
482
+ web_nlg_en,0,median,rouge2_fmeasure,0.011184314741657642
483
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0604895960614538
484
+ web_nlg_en,1,explicit-graph-description2,rouge2_fmeasure,0.15888598086244612
485
+ web_nlg_en,1,implicit-graph-description,rouge2_fmeasure,0.06685938813989681
486
+ web_nlg_en,1,non-explicit-description,rouge2_fmeasure,0.10697454965926212
487
+ web_nlg_en,1,very-explicit-description,rouge2_fmeasure,0.08423671425300502
488
+ web_nlg_en,1,median,rouge2_fmeasure,0.08423671425300502
489
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.06086364336249341
490
+ web_nlg_en,2,explicit-graph-description2,rouge2_fmeasure,0.2342525950120652
491
+ web_nlg_en,2,implicit-graph-description,rouge2_fmeasure,0.09546217670504362
492
+ web_nlg_en,2,non-explicit-description,rouge2_fmeasure,0.10198081937332562
493
+ web_nlg_en,2,very-explicit-description,rouge2_fmeasure,0.10461100159118264
494
+ web_nlg_en,2,median,rouge2_fmeasure,0.10198081937332562
495
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06172653863702163
496
+ web_nlg_en,3,explicit-graph-description2,rouge2_fmeasure,0.2270035578776477
497
+ web_nlg_en,3,implicit-graph-description,rouge2_fmeasure,0.10072973751222168
498
+ web_nlg_en,3,non-explicit-description,rouge2_fmeasure,0.10398704909605484
499
+ web_nlg_en,3,very-explicit-description,rouge2_fmeasure,0.0979814053469226
500
+ web_nlg_en,3,median,rouge2_fmeasure,0.10072973751222168
501
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.061883789388597316
502
+ web_nlg_en,4,explicit-graph-description2,rouge2_fmeasure,0.21128493584064525
503
+ web_nlg_en,4,implicit-graph-description,rouge2_fmeasure,0.09678025171714386
504
+ web_nlg_en,4,non-explicit-description,rouge2_fmeasure,0.10086843076345688
505
+ web_nlg_en,4,very-explicit-description,rouge2_fmeasure,0.09599216387839389
506
+ web_nlg_en,4,median,rouge2_fmeasure,0.09678025171714386
507
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.06197974009288303
508
+ web_nlg_en,5,explicit-graph-description2,rouge2_fmeasure,0.20065053485376905
509
+ web_nlg_en,5,implicit-graph-description,rouge2_fmeasure,0.09493898136075185
510
+ web_nlg_en,5,non-explicit-description,rouge2_fmeasure,0.09425690323781327
511
+ web_nlg_en,5,very-explicit-description,rouge2_fmeasure,0.0949140921236195
512
+ web_nlg_en,5,median,rouge2_fmeasure,0.0949140921236195
513
+ web_nlg_en,5,average,multiple,0.08163765495349555
514
+ wiki_lingua_en,0,article_summary_en,rouge2_fmeasure,0.0397636720249358
515
+ wiki_lingua_en,0,rephrase_en,rouge2_fmeasure,0.012721191178839145
516
+ wiki_lingua_en,0,summarize_above_en,rouge2_fmeasure,0.024434310470910135
517
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03327297097578151
518
+ wiki_lingua_en,0,write_abstract_en,rouge2_fmeasure,0.011216041205494898
519
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.024434310470910135
520
+ wiki_lingua_en,1,article_summary_en,rouge2_fmeasure,0.042828281320032364
521
+ wiki_lingua_en,1,rephrase_en,rouge2_fmeasure,0.02711335795670395
522
+ wiki_lingua_en,1,summarize_above_en,rouge2_fmeasure,0.03141526988382421
523
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.056766090400891124
524
+ wiki_lingua_en,1,write_abstract_en,rouge2_fmeasure,0.021015379090322476
525
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.03141526988382421
526
+ wiki_lingua_en,2,article_summary_en,rouge2_fmeasure,0.051091351936466585
527
+ wiki_lingua_en,2,rephrase_en,rouge2_fmeasure,0.04454347983688272
528
+ wiki_lingua_en,2,summarize_above_en,rouge2_fmeasure,0.048234719869698274
529
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.057748452491246806
530
+ wiki_lingua_en,2,write_abstract_en,rouge2_fmeasure,0.01856227196277119
531
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.048234719869698274
532
+ wiki_lingua_en,3,article_summary_en,rouge2_fmeasure,0.04482124580351551
533
+ wiki_lingua_en,3,rephrase_en,rouge2_fmeasure,0.03895725296191616
534
+ wiki_lingua_en,3,summarize_above_en,rouge2_fmeasure,0.04242364396606276
535
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04689779702656875
536
+ wiki_lingua_en,3,write_abstract_en,rouge2_fmeasure,0.013461430147774255
537
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04242364396606276
538
+ wiki_lingua_en,4,article_summary_en,rouge2_fmeasure,0.014133038738676246
539
+ wiki_lingua_en,4,rephrase_en,rouge2_fmeasure,0.012100846520590037
540
+ wiki_lingua_en,4,summarize_above_en,rouge2_fmeasure,0.012171454699532493
541
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013741746630537094
542
+ wiki_lingua_en,4,write_abstract_en,rouge2_fmeasure,0.003266077788148105
543
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.012171454699532493
544
+ wiki_lingua_en,5,article_summary_en,rouge2_fmeasure,0.002017912016348425
545
+ wiki_lingua_en,5,rephrase_en,rouge2_fmeasure,0.0018127250079443033
546
+ wiki_lingua_en,5,summarize_above_en,rouge2_fmeasure,0.0013784838984500552
547
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0023512305693387013
548
+ wiki_lingua_en,5,write_abstract_en,rouge2_fmeasure,0.0001630519530024726
549
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0018127250079443033
550
+ wiki_lingua_en,5,average,multiple,0.026748687316328696
551
+ winogrande,0,Replace,acc,0.500394632991318
552
+ winogrande,0,True or False,acc,0.494869771112865
553
+ winogrande,0,does underscore refer to,acc,0.4696132596685083
554
+ winogrande,0,stand for,acc,0.49171270718232046
555
+ winogrande,0,underscore refer to,acc,0.49171270718232046
556
+ winogrande,0,median,accuracy,0.49171270718232046
557
+ winogrande,1,Replace,acc,0.5035516969218626
558
+ winogrande,1,True or False,acc,0.4925019731649566
559
+ winogrande,1,does underscore refer to,acc,0.4909234411996843
560
+ winogrande,1,stand for,acc,0.4956590370955012
561
+ winogrande,1,underscore refer to,acc,0.47908445146014206
562
+ winogrande,1,median,accuracy,0.4925019731649566
563
+ winogrande,2,Replace,acc,0.5067087608524072
564
+ winogrande,2,True or False,acc,0.5074980268350434
565
+ winogrande,2,does underscore refer to,acc,0.48303078137332284
566
+ winogrande,2,stand for,acc,0.4909234411996843
567
+ winogrande,2,underscore refer to,acc,0.49171270718232046
568
+ winogrande,2,median,accuracy,0.49171270718232046
569
+ winogrande,3,Replace,acc,0.5217048145224941
570
+ winogrande,3,True or False,acc,0.5067087608524072
571
+ winogrande,3,does underscore refer to,acc,0.494869771112865
572
+ winogrande,3,stand for,acc,0.4980268350434096
573
+ winogrande,3,underscore refer to,acc,0.5138121546961326
574
+ winogrande,3,median,accuracy,0.5067087608524072
575
+ winogrande,4,Replace,acc,0.5177584846093133
576
+ winogrande,4,True or False,acc,0.5059194948697711
577
+ winogrande,4,does underscore refer to,acc,0.49171270718232046
578
+ winogrande,4,stand for,acc,0.5059194948697711
579
+ winogrande,4,underscore refer to,acc,0.5177584846093133
580
+ winogrande,4,median,accuracy,0.5059194948697711
581
+ winogrande,5,Replace,acc,0.5193370165745856
582
+ winogrande,5,True or False,acc,0.5043409629044988
583
+ winogrande,5,does underscore refer to,acc,0.4996053670086819
584
+ winogrande,5,stand for,acc,0.4988161010260458
585
+ winogrande,5,underscore refer to,acc,0.5035516969218626
586
+ winogrande,5,median,accuracy,0.5035516969218626
587
+ winogrande,5,average,multiple,0.4986845566956064
4b284b42bc4/eval/merged.json ADDED
The diff for this file is too large to render. See raw diff
 
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5219512537266193, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0212930030956169}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07218926971887347, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012167661516642302}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.38930719643519357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004954916384960566}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11405231602258174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001654644088764185}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03343887788145288, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007607255949351679}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1941492016104801, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003831607721687585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0530914234584236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010851048561266318}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06799635945918656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011456716958398708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.36060457316843075, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004393491915214021}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10723485397749022, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015594878594255188}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06917630307565754, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011778495782492331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3693625063067186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004599410126151537}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10913009935368997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015983050972786382}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6352070705519831, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.032408525336239634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07232274024412362, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0011648296294780197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.40875337262130357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005170431645641866}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11545860904836557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016293321313512947}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03357621675403705, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007258241400547502}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.20421979666303372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0038705321992327218}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05387918484618448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001059642873696044}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0670219793734058, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010711080482606746}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3719335535759742, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004473658738704575}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10673572587703606, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001490340938407792}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06880374952093221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011085999525674158}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.38592664938862503, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004766870365189962}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10969517770602641, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015436923533265093}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6721480622931579, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04364421620033658}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07312830538850522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012172573610058964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.41786672242074735, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00514089320113892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11682333840421252, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016784638187503072}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03391327016863078, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007434663573137818}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.21097279270777233, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003962368460085698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05451282884342794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001075049065599178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06674973450321794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001076625707211798}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3763932336189791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004418572944831253}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10645004232970325, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014835449120631779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06947768209278911, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011601752390396401}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.39433935812714066, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00474918433328278}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1108401266606722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015949626703580934}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.45946097160655536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0069054289444320065}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.4838082934841815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005353612871054305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.40318520685428916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0050543331024213555}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.25981848081805536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005464287303749197}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2614431240216551, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004532273813349631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.22117893046486378, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004159001287103087}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.3790825033320128, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006113945088066168}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.40459178460001194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004829957825352463}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3311134882286364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004433468116819751}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.40522430638291235, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006350898587976966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4260396970904746, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004984787219803331}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.35357342466071756, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004606600028590757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 4.728695653942699, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.20951842561350045}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5132210909931271, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006770200037421206}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5085672940783099, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005219014740462864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.44649942768587786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004938340775431664}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.2962906481288477, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005484844549091213}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.28550066477044644, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004627831211110698}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2516831348306221, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004237671368186907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.42385682852035556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006080173471955614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4235294811450316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004789452676639955}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3673926906028998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004434765375125032}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4544084219641523, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006266153839723712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.44938547317564664, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004876193487205648}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.39330123509502896, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004530237812729094}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.212392173804924, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.3162249116972436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5199440915994101, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006624486616712392}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5191675780106996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0050701091896996385}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.4537703875613086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004716616160390303}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.3031051480032843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005329062576870483}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.2963979153791751, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044953737902237645}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2576884269938096, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004005629924060058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.4266291186121007, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005898467057932758}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4282197768995522, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004680222294006507}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.36978135277630003, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004192067697375704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.4599896912593643, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006130864600093757}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4576419090445212, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004749819488017943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.39852235622906906, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004308718484971879}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.5704316145072506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.21448561410544192}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5083670715467508, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04793752313730723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.13731115268889618, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004134328663985652}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5304312650087452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005137749962807634}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.1770932101083039, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032503426283328517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.07057132542466306, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002672953645282832}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2756854307207382, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00443727730222414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.08858118067497767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0022195618622123207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.11318481239908815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034285009887527296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.45774658527361894, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004751494427100153}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.14699962330240496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002639563461886689}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.12133841288359587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0036717057968418485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4712273535434232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004840505439839539}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.15663423920541245, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00293591591942458}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5689932623794205, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03487597258964387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.15135031922863815, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0045240430311319964}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5274677588088789, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005168283267163321}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.18658181154870737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032951371932804074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.0812976310351736, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.003052517248887665}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.2876975324614106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004539489201779465}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.09696397066162622, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0023123478544169115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.12650520735598997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003931360833531472}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.45320651913396676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004927916099837654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15529696958918174, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027889786826169605}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.13503878881995351, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004117933379311923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4710862176395619, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004955060111554295}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.16572981825064162, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0029891616403949226}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5666625357618493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.035144502313160096}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.16225163818440333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004930277023326987}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5198390606155927, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005004253324448323}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.19269563367126713, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034429019094789712}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.08999073165958532, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0034053408100276723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.290142216690991, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004435580660257063}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.10319722625311183, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002497320165770724}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.13522779384865216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004269818421199203}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4424221249100704, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004698632856913699}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.15984455179709206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002959046280237486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.14570044951057284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004496072255677459}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.46658013872741555, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004743391558616772}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.17246353477949722, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0031557973499711975}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1819238915458583, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002750353826660657}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.7029413363719536, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003808929381311445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.26990147215432525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0029611077960044795}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08828034705691859, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001756213173675814}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.36957211038213966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004295231025460387}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.1321765392862873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002035893685578508}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.1395246877294493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020982257188054973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5674658932955065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004066884538719425}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.20873220289543726, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0022417883144256346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.15727492631630607, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024406377529898574}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6165732680882078, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0037569757915729735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.23372715316587211, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0026271080876903194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.7848178181585044, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11165273099003932}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.16916143883050352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002379872667769485}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6810622565859463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004015102382030035}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.25585929817242026, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0027996781558269953}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.08144241568678086, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014775734418982862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3576674736285389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004354118064844373}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.12475959136600752, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0019179222108368782}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.12901201268176216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017552360200443738}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.547948402844493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004162509193062588}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.19697428121356045, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0021070987747823585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.1469702815726482, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002085618531486058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.6026159583567549, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003950263273368332}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.22282022865304935, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0024572204259037207}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.713664635412786, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08533018956868954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.1578995517139064, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002188262308646243}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.661260656597642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004201918635105074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.24049035651787887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00265323510540592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.0765689099285028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013983221970486452}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3518820790483638, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004418107440625639}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.11816403232435718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0018236102355301655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.12155855741792118, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016664561371413115}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5312655407554171, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004223895153456735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.18655013034032525, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020589747081609495}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.13827095197584383, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019334746802031329}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5862691174388023, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004033599895639771}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.21089639595956103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0023465624968102092}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.469805707725389, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06490777021012714}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.17794283547769393, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004855332389330506}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6738492336641515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004349540444134296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.22783710711426441, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0038760861020319783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.09239931377519814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033174852810728913}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.3553116109965894, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004440024800683586}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.11404511235265344, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026899526031591296}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.1419890688901028, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004098782466396125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5573937424840384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004380416051067475}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.18086674415244586, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003193856695239382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.15719935748984376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00433118114964069}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6040019474400884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004432909544722048}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.20152895642827887, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003482992973760569}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.42884817219882, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.099116139881655}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.18751101358462757, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005063466765827528}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6803997964079052, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004146984246148592}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.23996034128160132, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004098823532354845}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.09981197167240217, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0034490088940847723}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.363206517232566, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044053972626213736}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.1236406159290505, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0029657970432989938}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.14948107867414953, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00430849430541505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5591640891574615, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004222675152018815}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.19036064787323584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0034698000115775366}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.16593161383927957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004490090618309719}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6127549245098377, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0042508539477660486}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.21344647829746557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0036991334796597887}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.648377702538737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08478382467689291}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.2170502412040914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0056785623479505346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6707499226771557, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004197291013190362}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.26187418406645563, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004479195310716335}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.12026175423850448, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.004116366667789958}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.35899246341620966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004337077760186949}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.13864465330888276, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0034090024819216245}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.1743575606183526, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0049703919122685986}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.5431873257825337, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004174698396559969}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.20774552707149974, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003911098641553765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.19370196026746925, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.005146435869582002}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.6067084039239514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004228274627594922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.23391725441186811, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0040594860490940805}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 2.657390816846655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07493284644080055}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.18480731121362143, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020792576907124544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.315121891878082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00281956327717947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.21546625995717986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019044373825407577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.043591724134436294, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009100610866868915}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.07663865850457913, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016683677543594749}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.05085522769122296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009806547557348483}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.13136007081848963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001407474447704256}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.2315354297581157, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022376304823093543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1545786655331593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001294688576797968}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.17135315593483375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019310536492561348}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.29331614741662265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002671841052773937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.19998174393388882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017739334682302418}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.4986435841452175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0954009333366125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.16054946339149265, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023872806268002453}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.26124779652266183, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034109215483210933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.18055709030131595, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002291239666041859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.03844271053495923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009548717405153122}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.06572671257249793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001676567990321315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.043685786411164565, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009951233609164975}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.1146885610289566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016607411459830235}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.19312647848640802, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026528744495399642}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.1299739944513172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016020857996698003}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.14933338107720576, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022223746600941365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.24353771456671877, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032013282199569457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.16805250413658923, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002137202674713072}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 2.6906547201013016, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10561504116700811}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.05199770725713768, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018998031112635958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.08654478337265463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003030755442148775}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.05713385129965973, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019394092460641402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.011716489586243313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006188408595148423}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.021706213807673947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001160694947893896}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.013403383907298045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006520713115630765}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.03835312744448057, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014121874773050304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.06471677863725485, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002320636959830996}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.041882673914681416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014105973125080264}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.048224676522694826, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017648142477153747}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.0803093006676827, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002819219094971394}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.05296871705271611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018034377431592255}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 0.5289185378545499, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03725901160237639}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.16287938762344858, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002125596227207644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.26891683127031046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029264720716827195}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.1876830793081277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020430243309695934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0396793386777498, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009455884698469472}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.06791041476534408, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016416822093375813}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.045953265934537524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010171810986012006}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.12593110207935895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014937847050657449}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.2160718901557372, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024089218033376556}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1469663361895663, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014726919857113133}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.15017973881687044, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001965190714131917}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.2491465690438633, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002748477636582947}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.17332222163527497, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018961374186126187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.8244022964598483, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07935443727949992}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.14381543741495909, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0023667032912517466}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.225629388401208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003254182304827278}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.15868452002960928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002240691662636904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.035731574000179026, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010349488403740507}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.05816887351927841, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0016059454698874895}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.039489525521791025, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000990906960301999}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.1126446519610345, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017869301538348855}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.18198894030060583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002668023279693901}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.1251846925684636, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016856714050454754}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.13318454491534204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022141082314480645}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.20977905263291247, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0030675162093798812}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.14690611817944446, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020806139758284456}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 2.9288842585821677, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08545851002343076}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.046670256378216396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018631696503052615}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.07269676929849611, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002709598303284025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.04931235783073529, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018027220859303127}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.011045473611349471, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006328455956303861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.019434558915915258, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001153106186544495}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.012225365615349182, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006470992700405894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.03764501152188949, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00147394811246321}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.060278823007865176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022646729396056334}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.04000155130112757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014299390749652758}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.04298247190464882, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001738069209317656}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.06679218344108083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025083789140632304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.04523563611791948, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001662538594980625}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 0.49677796073841696, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.052625922121863185}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.18099679964244167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022642402116103914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.2981323814293587, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00283223986352777}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.20682521295592035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001994458501946162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04315528231168242, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009826962822616337}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.07284671298198961, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001691310520894889}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04926653251882711, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001025431194341212}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.13913612901543207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001621884412931162}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.2382490384217888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002334550886161128}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.16091879232181885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014094494994621246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.16710754510866463, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002120469796828209}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.27654018110872997, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002684831096463094}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.1910349072329293, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018521715266141294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 2.8561556111879174, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0728982607399944}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.1589750600600723, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026278503972424322}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.24856960492614927, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0034290945996938577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.17315198648906838, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023395404402379518}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.04013687657396167, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011383641303593693}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0650129563997214, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017038204023901537}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.04375734535627615, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010346234714461327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.12283468138914441, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0019880477155036994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.19794072747052693, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002794014068329132}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.13485492202044252, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017498822051336784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.14708276519326932, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002450731932737247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.23071550304096008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003222080601723914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.16017715054289589, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021714589994061635}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 3.1365248222604922, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11628221415118109}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.049068436227218, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001993000314908942}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.0775740659139895, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029232679441589045}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.051872966574056024, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019061734901385107}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.01199072000032627, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007398590703300153}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.0204455235276103, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012282519026501563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.012765677291574816, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006703661625741191}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.038829622758228356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015610696154773989}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.0634321576713279, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002441124896735242}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.041302516354620705, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014872448773892877}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.045154904255880535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018406863174026315}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.07128105012283013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027050833760683883}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.04760963919336614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017500349071780813}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 0.48599285663479985, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04001588316652065}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2022231395663083, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002327440444373304}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.3212542850069266, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028325100234559927}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.22687895073683426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019722087479110005}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.05298207477606098, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011059450545409227}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0867754880829245, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018124553730314602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05947240390341843, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011148780082595852}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.14766643493041728, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016788338597698632}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.24166027084574745, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023102550180774937}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1667829458147027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013918547568942457}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.18955567145755783, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021818874452280475}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.30206706797305843, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026937255424983246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.21293287161862917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018545411601376014}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.26946391692664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06690865495365421}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.17603411917572567, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027088089976067017}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2612353818883475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0033992744560135033}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.18692561650624942, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0023335888686542303}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.04559336904702604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012112817237957673}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.06990530615420809, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0017473693679177455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04827221140963156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001080517273254867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.13188323013007386, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020767846091407608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.19847294800292903, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002681619286646281}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13971894217937625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017066104221784629}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.1648347813541367, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0025407224451518312}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.24568842200265278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0032363366658664243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.17522886133321208, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0021928506565670742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 3.0982581904465443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09308367989527225}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.05811148139738531, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022224537018629854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.08558209366311444, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003003751048628007}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.058933509019544375, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002002584327257243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.014672079613835081, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008619146516289935}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.02381134201045262, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012808609018768735}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.015382205296321172, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000747616978372682}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.04421230588130963, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017253556363268328}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.06610553705292643, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002375418784846169}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.04464141924307046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015078323046653491}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.05425853906264316, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002080511020141031}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.07994658087674156, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0028206819757526707}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.05503716623523508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018776883247322259}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.6565411757725536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.052324362211905354}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_2.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.11186372200442046, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018671387732113079}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.17879552912162308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027643379350067035}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.12691648074851333, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001896734902040353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01717084786190784, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006542423399459584}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.029277153235119115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011365556453727173}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.019772682308363394, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0007124577758867392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.08411171356077761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012658897573498854}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.13787036917147713, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020240765955970865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.09604059787645051, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001274636062224721}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.10367309622865617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017302566309302105}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.16661914846277825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025842506153635845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.11782155638903533, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017597196977228665}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.0455288685483874, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.047635370761101796}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_3.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.09476998627330442, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002049387309515581}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.14233990548855194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0028599756389382966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.10224304189369153, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019828555273926772}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.01579253403515002, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000714960507379995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.025370051032367204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010971245243116302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.017441204975488303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006872287960301365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0726982989153829, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001524607663698719}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.11120691117595559, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021760779267226923}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.07842574917248762, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014189355674646296}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.08821003752248946, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0019202411693044791}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.13254544508118085, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002664772195130541}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.09504316855119928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018454346298717522}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 1.2230608250631039, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08212157026799911}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.02400887856928141, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012801233338843008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.03676848661205508, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0018777990211272366}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0251536532172704, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012297831296573945}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.0040671748623580355, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004261458773707568}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.006848332649418315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0007065553357872231}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.004349993193557413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003848818988530138}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.018902837570424644, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009887113843588492}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.029393904053863548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014777305097725733}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.019722039262009786, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0009212732368853081}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.02224536000159842, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012029315455249722}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.03371996491485194, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001725783795993067}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.023124878533727647, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001135976545095596}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 0.08481215880086364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.008615953892796454}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.341, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014998131348402709}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.309, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014619600977206494}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_GPT-3-style_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc": 0.335, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014933117490932573}, {"task_name": "anli_r1", "prompt_name": "GPT-3 style", "acc_norm": 0.308, "fixed_answer_choice_list": ["True", "Neither", "False"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "620aa3fc-d5eb-46f5-a1ee-4c754527aa97", "prompt_jinja": "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014606483127342761}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.359, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015177264224798594}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.357, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015158521721486769}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_MNLI-crowdsource_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc": 0.347, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015060472031706617}, {"task_name": "anli_r1", "prompt_name": "MNLI crowdsource", "acc_norm": 0.333, "fixed_answer_choice_list": ["Correct", "Inconclusive", "Incorrect"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "0cc3ae39-3997-4686-8c93-5d51457efa1f", "prompt_jinja": "{{premise}} Using only the above description and what you know about the world, \"{{hypothesis}}\" is definitely correct, incorrect, or inconclusive? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01491084616422986}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014899597242811483}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.321, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014770821817934649}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_can-we-infer_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "can we infer", "acc": 0.324, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01480686473373886}, {"task_name": "anli_r1", "prompt_name": "can we infer", "acc_norm": 0.322, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "c4ed37ae-d7d7-4197-a725-ef2152fa3b1f", "prompt_jinja": "Suppose {{premise}} Can we infer that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014782913600996673}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.318, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014734079309311901}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.329, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014865395385928374}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_guaranteed-possible-impossible_5.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc": 0.322, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01478291360099667}, {"task_name": "anli_r1", "prompt_name": "guaranteed/possible/impossible", "acc_norm": 0.314, "fixed_answer_choice_list": ["Guaranteed", "Possible", "Impossible"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "ca24b93a-6265-462f-b140-e329c03d94fa", "prompt_jinja": "Assume it is true that {{premise}} \n\nTherefore, \"{{hypothesis}}\" is {{\"guaranteed\"}}, {{\"possible\"}}, or {{\"impossible\"}}? ||| {{ answer_choices[label] }}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01468399195108797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
4b284b84bc4/eval/agg.4b284b84bc4_anli_r1_justified-in-saying_4.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": [{"task_name": "anli_r1", "prompt_name": "justified in saying", "acc": 0.34, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_stderr": 0.014987482264363937}, {"task_name": "anli_r1", "prompt_name": "justified in saying", "acc_norm": 0.332, "fixed_answer_choice_list": ["Yes", "Maybe", "No"], "dataset_path": "anli", "dataset_name": null, "subset": 1, "prompt_id": "a850110d-f1a3-49b4-949a-d3bfe9f81344", "prompt_jinja": "{{premise}} Are we justified in saying that \"{{hypothesis}}\"? Yes, no, or maybe? ||| {{ answer_choices[label] }} ", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.014899597242811483}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}