File size: 13,891 Bytes
899c702 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
dataset,prompt,metric,value
amazon_reviews_multi_en,prompt_body_title_to_star,accuracy,0.5508
amazon_reviews_multi_en,prompt_review_to_star,accuracy,0.51
amazon_reviews_multi_en,prompt_title_to_star,accuracy,0.3758
amazon_reviews_multi_en,median,accuracy,0.51
amazon_reviews_multi_es,prompt_body_title_to_star,accuracy,0.4776
amazon_reviews_multi_es,prompt_review_to_star,accuracy,0.4444
amazon_reviews_multi_es,prompt_title_to_star,accuracy,0.3088
amazon_reviews_multi_es,median,accuracy,0.4444
amazon_reviews_multi_fr,prompt_body_title_to_star,accuracy,0.4742
amazon_reviews_multi_fr,prompt_review_to_star,accuracy,0.4492
amazon_reviews_multi_fr,prompt_title_to_star,accuracy,0.3192
amazon_reviews_multi_fr,median,accuracy,0.4492
amazon_reviews_multi_zh,prompt_body_title_to_star,accuracy,0.4712
amazon_reviews_multi_zh,prompt_review_to_star,accuracy,0.4478
amazon_reviews_multi_zh,prompt_title_to_star,accuracy,0.3208
amazon_reviews_multi_zh,median,accuracy,0.4478
aqua_rat_raw,Answer questions from options,accuracy,0.2440944881889764
aqua_rat_raw,answer_quiz,accuracy,0.23228346456692914
aqua_rat_raw,select_the_best_option,accuracy,0.25196850393700787
aqua_rat_raw,median,accuracy,0.2440944881889764
art_None,choose_hypothesis,accuracy,0.6109660574412533
art_None,choose_hypothesis_believable,accuracy,0.5926892950391645
art_None,choose_hypothesis_desc,accuracy,0.554177545691906
art_None,choose_hypothesis_likely,accuracy,0.5587467362924282
art_None,choose_hypothesis_options,accuracy,0.5842036553524804
art_None,median,accuracy,0.5842036553524804
banking77_None,direct_to_which_department,accuracy,0.19675324675324676
banking77_None,help_page_topic,accuracy,0.2538961038961039
banking77_None,rephrase_as_banking_term,accuracy,0.2636363636363636
banking77_None,median,accuracy,0.2538961038961039
blbooksgenre_title_genre_classifiction,classify,accuracy,0.27764976958525345
blbooksgenre_title_genre_classifiction,multi-choice,accuracy,0.8456221198156681
blbooksgenre_title_genre_classifiction,premise_context_first,accuracy,0.7494239631336406
blbooksgenre_title_genre_classifiction,median,accuracy,0.7494239631336406
blimp_adjunct_island,grammatical_between_1_2,accuracy,0.516
blimp_adjunct_island,grammatical_between_A_B,accuracy,0.428
blimp_adjunct_island,grammatical_which_one_1_2,accuracy,0.576
blimp_adjunct_island,single_sentence_bad_yes_no,accuracy,0.478
blimp_adjunct_island,single_sentence_good_yes_no,accuracy,0.499
blimp_adjunct_island,median,accuracy,0.499
climate_fever_None,claim_and_all_supporting_evidences,accuracy,0.4273615635179153
climate_fever_None,fifth_evidence_and_claim_itemization,accuracy,0.43973941368078173
climate_fever_None,first_evidence_and_claim_itemization,accuracy,0.3283387622149837
climate_fever_None,second_evidence_and_claim_itemization,accuracy,0.47687296416938113
climate_fever_None,third_evidence_claim_pair,accuracy,0.4586319218241042
climate_fever_None,median,accuracy,0.43973941368078173
codah_codah,affirmative_instruction_after_sentence_and_choices,accuracy,0.7327089337175793
codah_codah,affirmative_instruction_before_sentence_and_choices,accuracy,0.7359510086455331
codah_codah,interrogative_instruction_after_sentence_and_choices,accuracy,0.736671469740634
codah_codah,median,accuracy,0.7359510086455331
commonsense_qa_None,answer_given_question_without_options,accuracy,0.6224406224406225
commonsense_qa_None,most_suitable_answer,accuracy,0.8435708435708436
commonsense_qa_None,question_answering,accuracy,0.8304668304668305
commonsense_qa_None,median,accuracy,0.8304668304668305
conv_ai_3_None,ambiguous,accuracy,0.39040207522697795
conv_ai_3_None,clarification_needed,accuracy,0.39040207522697795
conv_ai_3_None,directly_answer,accuracy,0.6095979247730221
conv_ai_3_None,score_give_number,accuracy,0.37959360138348464
conv_ai_3_None,score_how_much,accuracy,0.03285776048421963
conv_ai_3_None,median,accuracy,0.39040207522697795
craigslist_bargains_None,best deal,accuracy,0.49246231155778897
craigslist_bargains_None,good deal for seller,accuracy,0.4371859296482412
craigslist_bargains_None,good deal for seller no list price,accuracy,0.6046901172529313
craigslist_bargains_None,good deal for seller no list price implicit,accuracy,0.25963149078726966
craigslist_bargains_None,median,accuracy,0.4648241206030151
emotion_None,answer_question_with_emotion_label,accuracy,0.344
emotion_None,answer_with_class_label,accuracy,0.2295
emotion_None,choose_the_best_emotion_label,accuracy,0.317
emotion_None,reply_with_emoation_label,accuracy,0.5025
emotion_None,median,accuracy,0.3305
financial_phrasebank_sentences_allagree,bullish_neutral_bearish,accuracy,0.3710247349823322
financial_phrasebank_sentences_allagree,complementary_industries,accuracy,0.04637809187279152
financial_phrasebank_sentences_allagree,sentiment,accuracy,0.3489399293286219
financial_phrasebank_sentences_allagree,share_price_option,accuracy,0.37146643109540634
financial_phrasebank_sentences_allagree,word_comes_to_mind,accuracy,0.01987632508833922
financial_phrasebank_sentences_allagree,median,accuracy,0.3489399293286219
glue_cola,Following sentence acceptable,accuracy,0.5685522531160115
glue_cola,Make sense yes no,accuracy,0.3326941514860978
glue_cola,Previous sentence acceptable,accuracy,0.3096836049856184
glue_cola,editing,accuracy,0.3144774688398849
glue_cola,is_this_correct,accuracy,0.4592521572387344
glue_cola,median,accuracy,0.3326941514860978
glue_sst2,following positive negative,accuracy,0.9415137614678899
glue_sst2,happy or mad,accuracy,0.9013761467889908
glue_sst2,positive negative after,accuracy,0.9461009174311926
glue_sst2,review,accuracy,0.9403669724770642
glue_sst2,said,accuracy,0.9185779816513762
glue_sst2,median,accuracy,0.9403669724770642
head_qa_en,multiple_choice_a_and_q_en,accuracy,0.27379209370424595
head_qa_en,multiple_choice_a_and_q_with_context_en,accuracy,0.2730600292825769
head_qa_en,multiple_choice_q_and_a_en,accuracy,0.40922401171303074
head_qa_en,multiple_choice_q_and_a_index_en,accuracy,0.3916544655929722
head_qa_en,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3857979502196193
head_qa_en,median,accuracy,0.3857979502196193
head_qa_es,multiple_choice_a_and_q_en,accuracy,0.2679355783308931
head_qa_es,multiple_choice_a_and_q_with_context_en,accuracy,0.2642752562225476
head_qa_es,multiple_choice_q_and_a_en,accuracy,0.39751098096632503
head_qa_es,multiple_choice_q_and_a_index_en,accuracy,0.3506588579795022
head_qa_es,multiple_choice_q_and_a_index_with_context_en,accuracy,0.3440702781844802
head_qa_es,median,accuracy,0.3440702781844802
health_fact_None,claim_explanation_classification,accuracy,0.5755102040816327
health_fact_None,claim_veracity_classification_after_reading_I_believe,accuracy,0.31510204081632653
health_fact_None,claim_veracity_classification_tell_me,accuracy,0.053877551020408164
health_fact_None,median,accuracy,0.31510204081632653
hlgd_None,is_same_event_editor_asks,accuracy,0.6230062832286128
hlgd_None,is_same_event_interrogative_talk,accuracy,0.7056549057515709
hlgd_None,is_same_event_refer,accuracy,0.6457225712904785
hlgd_None,is_same_event_with_time_interrogative_related,accuracy,0.7873368777187046
hlgd_None,is_same_event_with_time_interrogative_talk,accuracy,0.8182696955050749
hlgd_None,median,accuracy,0.7056549057515709
hyperpartisan_news_detection_byarticle,consider_does_it_follow_a_hyperpartisan_argumentation,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consider_it_exhibits_extreme_one_sidedness,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,consume_with_caution,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,extreme_left_wing_or_right_wing,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,follows_hyperpartisan_argumentation,accuracy,0.6310077519379845
hyperpartisan_news_detection_byarticle,median,accuracy,0.6310077519379845
liar_None,Given statement guess category,accuracy,0.20794392523364486
liar_None,median,accuracy,0.20794392523364486
lince_sa_spaeng,express sentiment,accuracy,0.5814954276492738
lince_sa_spaeng,negation template,accuracy,0.16621839698762775
lince_sa_spaeng,original poster expressed sentiment,accuracy,0.5831091984938139
lince_sa_spaeng,sentiment trying to express,accuracy,0.584722969338354
lince_sa_spaeng,the author seem,accuracy,0.5691231845077999
lince_sa_spaeng,median,accuracy,0.5814954276492738
math_qa_None,choose_correct_og,accuracy,0.2100502512562814
math_qa_None,first_choice_then_problem,accuracy,0.21708542713567838
math_qa_None,gre_problem,accuracy,0.20871021775544388
math_qa_None,pick_the_correct,accuracy,0.21139028475711893
math_qa_None,problem_set_type,accuracy,0.38123953098827473
math_qa_None,median,accuracy,0.21139028475711893
mlsum_es,layman_summ_es,bleu,0.03612948631805906
mlsum_es,palm_prompt,bleu,0.048150532126973386
mlsum_es,summarise_this_in_es_few_sentences,bleu,0.03385324638634216
mlsum_es,median,bleu,0.03612948631805906
movie_rationales_None,Evidences + review,accuracy,0.975
movie_rationales_None,Evidences sentiment classification,accuracy,0.99
movie_rationales_None,Standard binary sentiment analysis,accuracy,0.95
movie_rationales_None,median,accuracy,0.975
mwsc_None,in-the-sentence,accuracy,0.5853658536585366
mwsc_None,in-the-sentence-question-first,accuracy,0.6219512195121951
mwsc_None,is-correct,accuracy,0.5487804878048781
mwsc_None,options-or,accuracy,0.6951219512195121
mwsc_None,what-think,accuracy,0.6951219512195121
mwsc_None,median,accuracy,0.6219512195121951
onestop_english_None,ara_context,accuracy,0.2945326278659612
onestop_english_None,assess,accuracy,0.54673721340388
onestop_english_None,determine_reading_level_from_the_first_three_sentences,accuracy,0.37918871252204583
onestop_english_None,esl_context,accuracy,0.6402116402116402
onestop_english_None,esl_variation,accuracy,0.5961199294532628
onestop_english_None,median,accuracy,0.54673721340388
poem_sentiment_None,guess_sentiment_without_options_variation_1,accuracy,0.21904761904761905
poem_sentiment_None,most_appropriate_sentiment,accuracy,0.29523809523809524
poem_sentiment_None,positive_or_negative_sentiment_variation_1,accuracy,0.21904761904761905
poem_sentiment_None,positive_or_negative_sentiment_variation_2,accuracy,0.22857142857142856
poem_sentiment_None,question_answer_format,accuracy,0.2571428571428571
poem_sentiment_None,median,accuracy,0.22857142857142856
pubmed_qa_pqa_labeled,Long Answer to Final Decision,accuracy,0.648
pubmed_qa_pqa_labeled,Question Answering (Short),accuracy,0.695
pubmed_qa_pqa_labeled,median,accuracy,0.6715
riddle_sense_None,answer_given_question_without_options,accuracy,0.48090107737512244
riddle_sense_None,most_suitable_answer,accuracy,0.40254652301665034
riddle_sense_None,question_answering,accuracy,0.3868756121449559
riddle_sense_None,question_to_answer_index,accuracy,0.3702252693437806
riddle_sense_None,median,accuracy,0.3947110675808031
scicite_None,Classify intent,accuracy,0.20414847161572053
scicite_None,Classify intent (choices first),accuracy,0.21069868995633187
scicite_None,Classify intent (select choice),accuracy,0.45414847161572053
scicite_None,Classify intent w/section (select choice),accuracy,0.5032751091703057
scicite_None,can_describe,accuracy,0.34279475982532753
scicite_None,median,accuracy,0.34279475982532753
selqa_answer_selection_analysis,is-he-talking-about,accuracy,0.9031847133757962
selqa_answer_selection_analysis,make-sense-rand,accuracy,0.8815286624203822
selqa_answer_selection_analysis,which-answer-1st-vs-random,accuracy,0.8726114649681529
selqa_answer_selection_analysis,would-make-sense-qu-rand,accuracy,0.9121019108280255
selqa_answer_selection_analysis,median,accuracy,0.8923566878980892
snips_built_in_intents_None,categorize_query,accuracy,0.39939024390243905
snips_built_in_intents_None,categorize_query_brief,accuracy,0.36585365853658536
snips_built_in_intents_None,intent_query,accuracy,0.31097560975609756
snips_built_in_intents_None,query_intent,accuracy,0.5823170731707317
snips_built_in_intents_None,voice_intent,accuracy,0.5762195121951219
snips_built_in_intents_None,median,accuracy,0.39939024390243905
wmt14_fr_en_en-fr,a_good_translation-en-fr-source+target,bleu,0.03901997019133066
wmt14_fr_en_en-fr,a_good_translation-en-fr-target,bleu,0.013934207960053381
wmt14_fr_en_en-fr,gpt3-en-fr,bleu,0.0008726814351547542
wmt14_fr_en_en-fr,version-en-fr-target,bleu,0.04126763289443808
wmt14_fr_en_en-fr,xglm-en-fr-target,bleu,0.2594147632125033
wmt14_fr_en_en-fr,median,bleu,0.03901997019133066
wmt14_fr_en_fr-en,a_good_translation-fr-en-source+target,bleu,0.29535567491027065
wmt14_fr_en_fr-en,a_good_translation-fr-en-target,bleu,0.10053995021986518
wmt14_fr_en_fr-en,gpt3-fr-en,bleu,0.05996411710924088
wmt14_fr_en_fr-en,version-fr-en-target,bleu,0.2543366934119538
wmt14_fr_en_fr-en,xglm-fr-en-target,bleu,0.289915194963351
wmt14_fr_en_fr-en,median,bleu,0.2543366934119538
wmt14_hi_en_en-hi,a_good_translation-en-hi-source+target,bleu,0.006990276538877561
wmt14_hi_en_en-hi,a_good_translation-en-hi-target,bleu,0.0018050206530453908
wmt14_hi_en_en-hi,gpt-3-en-hi-target,bleu,2.984520737729336e-10
wmt14_hi_en_en-hi,version-en-hi-target,bleu,0.007268866226269155
wmt14_hi_en_en-hi,xglm-en-hi-target,bleu,0.06785861030301621
wmt14_hi_en_en-hi,median,bleu,0.006990276538877561
wmt14_hi_en_hi-en,a_good_translation-hi-en-source+target,bleu,0.15724256465201472
wmt14_hi_en_hi-en,a_good_translation-hi-en-target,bleu,0.06515805969434861
wmt14_hi_en_hi-en,gpt-3-hi-en-target,bleu,1.9706666216345307e-162
wmt14_hi_en_hi-en,version-hi-en-target,bleu,0.15422032309127792
wmt14_hi_en_hi-en,xglm-hi-en-target,bleu,0.17022583047573708
wmt14_hi_en_hi-en,median,bleu,0.15422032309127792
multiple,average,multiple,0.4485518661820451
|