File size: 13,867 Bytes
f2124a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
dataset,prompt,metric,value
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5416
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.509
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.364
amazon_reviews_multi_en,median,accuracy,0.509
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.4448
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.4326
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.2802
amazon_reviews_multi_es,median,accuracy,0.4326
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.449
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.4392
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3128
amazon_reviews_multi_fr,median,accuracy,0.4392
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.421
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.4048
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.302
amazon_reviews_multi_zh,median,accuracy,0.4048
aqua_rat_raw,Answer questions from options,accuracy,0.2755905511811024
aqua_rat_raw,answer_quiz,accuracy,0.2677165354330709
aqua_rat_raw,select_the_best_option,accuracy,0.28346456692913385
aqua_rat_raw,median,accuracy,0.2755905511811024
art_None,choose_hypothesis,accuracy,0.6742819843342036
art_None,choose_hypothesis_believable,accuracy,0.6677545691906005
art_None,choose_hypothesis_desc,accuracy,0.5515665796344648
art_None,choose_hypothesis_likely,accuracy,0.5737597911227154
art_None,choose_hypothesis_options,accuracy,0.6657963446475196
art_None,median,accuracy,0.6657963446475196
banking77_None,direct_to_which_department,accuracy,0.16688311688311688
banking77_None,help_page_topic,accuracy,0.2857142857142857
banking77_None,rephrase_as_banking_term,accuracy,0.2905844155844156
banking77_None,median,accuracy,0.2857142857142857
blbooksgenre_title_genre_classifiction,classify,accuracy,0.3127880184331797
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.8640552995391705
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7707373271889401
blbooksgenre_title_genre_classifiction,median,accuracy,0.7707373271889401
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.466
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.327
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.498
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.507
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.516
blimp_adjunct_island,median,accuracy,0.498
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.24299674267100976
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.36612377850162864
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.15895765472312703
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.14788273615635178
climate_fever_None,third_evidence_claim_pair,accuracy,0.18631921824104233
climate_fever_None,median,accuracy,0.18631921824104233
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.8065561959654178
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.7885446685878963
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.8090778097982709
codah_codah,median,accuracy,0.8065561959654178
commonsense_qa_None,answer_given_question_without_options,accuracy,0.7018837018837019
commonsense_qa_None,most_suitable_answer,accuracy,0.8304668304668305
commonsense_qa_None,question_answering,accuracy,0.8026208026208026
commonsense_qa_None,median,accuracy,0.8026208026208026
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
conv_ai_3_None,score_give_number,accuracy,0.21444012105490703
conv_ai_3_None,score_how_much,accuracy,0.21444012105490703
conv_ai_3_None,median,accuracy,0.39040207522697795
craigslist_bargains_None,best deal,accuracy,0.5175879396984925
craigslist_bargains_None,good deal for seller,accuracy,0.2864321608040201
craigslist_bargains_None,good deal for seller no list price,accuracy,0.16917922948073702
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.24288107202680068
craigslist_bargains_None,median,accuracy,0.2646566164154104
emotion_None,answer_question_with_emotion_label,accuracy,0.3675
emotion_None,answer_with_class_label,accuracy,0.1445
emotion_None,choose_the_best_emotion_label,accuracy,0.3665
emotion_None,reply_with_emoation_label,accuracy,0.452
emotion_None,median,accuracy,0.367
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.24823321554770317
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.0627208480565371
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.3630742049469965
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.37234982332155475
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.05830388692579505
financial_phrasebank_sentences_allagree,median,accuracy,0.24823321554770317
glue_cola,Following sentence acceptable,accuracy,0.50143815915628
glue_cola,Make sense yes no,accuracy,0.6337488015340365
glue_cola,Previous sentence acceptable,accuracy,0.3461169702780441
glue_cola,editing,accuracy,0.4458293384467881
glue_cola,is_this_correct,accuracy,0.4228187919463087
glue_cola,median,accuracy,0.4458293384467881
glue_sst2,following positive negative,accuracy,0.944954128440367
glue_sst2,happy or mad,accuracy,0.9334862385321101
glue_sst2,positive negative after,accuracy,0.9392201834862385
glue_sst2,review,accuracy,0.9506880733944955
glue_sst2,said,accuracy,0.819954128440367
glue_sst2,median,accuracy,0.9392201834862385
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.32430453879941434
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.3330893118594436
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.5395314787701317
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.5314787701317716
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.5380673499267935
head_qa_en,median,accuracy,0.5314787701317716
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.3213762811127379
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.32723279648609077
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.5080527086383602
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.5175695461200586
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.5153733528550513
head_qa_es,median,accuracy,0.5080527086383602
health_fact_None,claim_explanation_classification,accuracy,0.6130612244897959
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.4791836734693877
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.052244897959183675
health_fact_None,median,accuracy,0.4791836734693877
hlgd_None,is_same_event_editor_asks,accuracy,0.5360077332044466
hlgd_None,is_same_event_interrogative_talk,accuracy,0.6549057515708071
hlgd_None,is_same_event_refer,accuracy,0.7114548090865153
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.6756887385210246
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.7844369260512325
hlgd_None,median,accuracy,0.6756887385210246
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6372093023255814
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,median,accuracy,0.6310077519379845
liar_None,Given statement guess category,accuracy,0.2087227414330218
liar_None,median,accuracy,0.2087227414330218
lince_sa_spaeng,express sentiment,accuracy,0.5960193652501344
lince_sa_spaeng,negation template,accuracy,0.36847767616998384
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.6008606777837547
lince_sa_spaeng,sentiment trying to express,accuracy,0.5954814416352878
lince_sa_spaeng,the author seem,accuracy,0.5965572888649812
lince_sa_spaeng,median,accuracy,0.5960193652501344
math_qa_None,choose_correct_og,accuracy,0.22981574539363483
math_qa_None,first_choice_then_problem,accuracy,0.192964824120603
math_qa_None,gre_problem,accuracy,0.2184254606365159
math_qa_None,pick_the_correct,accuracy,0.2150753768844221
math_qa_None,problem_set_type,accuracy,0.4737018425460637
math_qa_None,median,accuracy,0.2184254606365159
mlsum_es,layman_summ_es,bleu,0.036061261250491146
mlsum_es,palm_prompt,bleu,0.04155428402841844
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.027821053236675306
mlsum_es,median,bleu,0.036061261250491146
movie_rationales_None,Evidences + review,accuracy,0.985
movie_rationales_None,Evidences sentiment classification,accuracy,0.995
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.955
movie_rationales_None,median,accuracy,0.985
mwsc_None,in-the-sentence,accuracy,0.6829268292682927
mwsc_None,in-the-sentence-question-first,accuracy,0.6585365853658537
mwsc_None,is-correct,accuracy,0.7195121951219512
mwsc_None,options-or,accuracy,0.8048780487804879
mwsc_None,what-think,accuracy,0.7682926829268293
mwsc_None,median,accuracy,0.7195121951219512
onestop_english_None,ara_context,accuracy,0.4673721340388007
onestop_english_None,assess,accuracy,0.3350970017636684
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.5308641975308642
onestop_english_None,esl_context,accuracy,0.41798941798941797
onestop_english_None,esl_variation,accuracy,0.3386243386243386
onestop_english_None,median,accuracy,0.41798941798941797
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.20952380952380953
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.23809523809523808
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.23809523809523808
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.23809523809523808
poem_sentiment_None,question_answer_format,accuracy,0.24761904761904763
poem_sentiment_None,median,accuracy,0.23809523809523808
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.704
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.744
pubmed_qa_pqa_labeled,median,accuracy,0.724
riddle_sense_None,answer_given_question_without_options,accuracy,0.5925563173359452
riddle_sense_None,most_suitable_answer,accuracy,0.5161606268364348
riddle_sense_None,question_answering,accuracy,0.47502448579823703
riddle_sense_None,question_to_answer_index,accuracy,0.49657198824681686
riddle_sense_None,median,accuracy,0.5063663075416258
scicite_None,Classify intent,accuracy,0.6266375545851528
scicite_None,Classify intent (choices first),accuracy,0.4705240174672489
scicite_None,Classify intent (select choice),accuracy,0.4388646288209607
scicite_None,Classify intent w/section (select choice),accuracy,0.5491266375545851
scicite_None,can_describe,accuracy,0.6342794759825328
scicite_None,median,accuracy,0.5491266375545851
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9184713375796179
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.9426751592356688
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.9006369426751593
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.910828025477707
selqa_answer_selection_analysis,median,accuracy,0.9146496815286624
snips_built_in_intents_None,categorize_query,accuracy,0.7865853658536586
snips_built_in_intents_None,categorize_query_brief,accuracy,0.7012195121951219
snips_built_in_intents_None,intent_query,accuracy,0.4176829268292683
snips_built_in_intents_None,query_intent,accuracy,0.7835365853658537
snips_built_in_intents_None,voice_intent,accuracy,0.7012195121951219
snips_built_in_intents_None,median,accuracy,0.7012195121951219
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.26028441633496957
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.26105356968174953
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.17923414272364485
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.23518794525011924
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.27490320032481685
wmt14_fr_en_en-fr,median,bleu,0.26028441633496957
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.22344520948134364
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.2988387938888211
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.2897671081332691
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.3370883690137962
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.26028992585410116
wmt14_fr_en_fr-en,median,bleu,0.2897671081332691
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.09550778502148496
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.10547062820945455
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,0.034030829410154916
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.1149224530123302
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.06980407323250921
wmt14_hi_en_en-hi,median,bleu,0.09550778502148496
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.04963973034828739
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.11802320249982352
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,1.9401417583412615e-15
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.2117559943306028
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.1834661289471336
wmt14_hi_en_hi-en,median,bleu,0.11802320249982352
multiple,average,multiple,0.4784114531991768