File size: 10,544 Bytes
9f02120 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.426
anli_dev_r1,MNLI crowdsource,accuracy,0.402
anli_dev_r1,can we infer,accuracy,0.401
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.314
anli_dev_r1,justified in saying,accuracy,0.387
anli_dev_r1,median,accuracy,0.401
anli_dev_r2,GPT-3 style,accuracy,0.383
anli_dev_r2,MNLI crowdsource,accuracy,0.374
anli_dev_r2,can we infer,accuracy,0.394
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.302
anli_dev_r2,justified in saying,accuracy,0.376
anli_dev_r2,median,accuracy,0.376
anli_dev_r3,GPT-3 style,accuracy,0.42
anli_dev_r3,MNLI crowdsource,accuracy,0.4116666666666667
anli_dev_r3,can we infer,accuracy,0.38916666666666666
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.2966666666666667
anli_dev_r3,justified in saying,accuracy,0.35833333333333334
anli_dev_r3,median,accuracy,0.38916666666666666
story_cloze_2016,Answer Given options,accuracy,0.8524853019775521
story_cloze_2016,Choose Story Ending,accuracy,0.8957776590058792
story_cloze_2016,Generate Ending,accuracy,0.709246392303581
story_cloze_2016,Novel Correct Ending,accuracy,0.8888295029396045
story_cloze_2016,Story Continuation and Options,accuracy,0.8850881881346874
story_cloze_2016,median,accuracy,0.8850881881346874
super_glue_cb,GPT-3 style,accuracy,0.8392857142857143
super_glue_cb,MNLI crowdsource,accuracy,0.35714285714285715
super_glue_cb,can we infer,accuracy,0.7857142857142857
super_glue_cb,guaranteed/possible/impossible,accuracy,0.5535714285714286
super_glue_cb,justified in saying,accuracy,0.7142857142857143
super_glue_cb,median,accuracy,0.7142857142857143
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66
super_glue_copa,best_option,accuracy,0.77
super_glue_copa,cause_effect,accuracy,0.8
super_glue_copa,i_am_hesitating,accuracy,0.81
super_glue_copa,plausible_alternatives,accuracy,0.84
super_glue_copa,median,accuracy,0.8
super_glue_rte,GPT-3 style,accuracy,0.7906137184115524
super_glue_rte,MNLI crowdsource,accuracy,0.8267148014440433
super_glue_rte,does it follow that,accuracy,0.7942238267148014
super_glue_rte,guaranteed true,accuracy,0.776173285198556
super_glue_rte,should assume,accuracy,0.7617328519855595
super_glue_rte,median,accuracy,0.7906137184115524
winogrande_winogrande_xl,Replace,accuracy,0.5588003157063931
winogrande_winogrande_xl,True or False,accuracy,0.5280189423835833
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5651144435674822
winogrande_winogrande_xl,stand for,accuracy,0.5082872928176796
winogrande_winogrande_xl,underscore refer to,accuracy,0.5651144435674822
winogrande_winogrande_xl,median,accuracy,0.5588003157063931
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.46
xcopa_id,best_option,accuracy,0.7
xcopa_id,cause_effect,accuracy,0.73
xcopa_id,i_am_hesitating,accuracy,0.72
xcopa_id,plausible_alternatives,accuracy,0.67
xcopa_id,median,accuracy,0.7
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.6
xcopa_sw,best_option,accuracy,0.55
xcopa_sw,cause_effect,accuracy,0.54
xcopa_sw,i_am_hesitating,accuracy,0.51
xcopa_sw,plausible_alternatives,accuracy,0.52
xcopa_sw,median,accuracy,0.54
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.59
xcopa_ta,best_option,accuracy,0.56
xcopa_ta,cause_effect,accuracy,0.6
xcopa_ta,i_am_hesitating,accuracy,0.57
xcopa_ta,plausible_alternatives,accuracy,0.62
xcopa_ta,median,accuracy,0.59
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.53
xcopa_vi,best_option,accuracy,0.72
xcopa_vi,cause_effect,accuracy,0.72
xcopa_vi,i_am_hesitating,accuracy,0.7
xcopa_vi,plausible_alternatives,accuracy,0.71
xcopa_vi,median,accuracy,0.71
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.67
xcopa_zh,best_option,accuracy,0.7
xcopa_zh,cause_effect,accuracy,0.8
xcopa_zh,i_am_hesitating,accuracy,0.77
xcopa_zh,plausible_alternatives,accuracy,0.79
xcopa_zh,median,accuracy,0.77
xnli_ar,GPT-3 style,accuracy,0.5558232931726907
xnli_ar,MNLI crowdsource,accuracy,0.42128514056224897
xnli_ar,can we infer,accuracy,0.5148594377510041
xnli_ar,guaranteed/possible/impossible,accuracy,0.40562248995983935
xnli_ar,justified in saying,accuracy,0.4927710843373494
xnli_ar,median,accuracy,0.4927710843373494
xnli_en,GPT-3 style,accuracy,0.5891566265060241
xnli_en,MNLI crowdsource,accuracy,0.42610441767068274
xnli_en,can we infer,accuracy,0.5662650602409639
xnli_en,guaranteed/possible/impossible,accuracy,0.4614457831325301
xnli_en,justified in saying,accuracy,0.5437751004016064
xnli_en,median,accuracy,0.5437751004016064
xnli_es,GPT-3 style,accuracy,0.5734939759036145
xnli_es,MNLI crowdsource,accuracy,0.40923694779116465
xnli_es,can we infer,accuracy,0.5148594377510041
xnli_es,guaranteed/possible/impossible,accuracy,0.43132530120481927
xnli_es,justified in saying,accuracy,0.4610441767068273
xnli_es,median,accuracy,0.4610441767068273
xnli_fr,GPT-3 style,accuracy,0.5666666666666667
xnli_fr,MNLI crowdsource,accuracy,0.42208835341365464
xnli_fr,can we infer,accuracy,0.5385542168674698
xnli_fr,guaranteed/possible/impossible,accuracy,0.39076305220883534
xnli_fr,justified in saying,accuracy,0.5100401606425703
xnli_fr,median,accuracy,0.5100401606425703
xnli_hi,GPT-3 style,accuracy,0.5345381526104418
xnli_hi,MNLI crowdsource,accuracy,0.41124497991967873
xnli_hi,can we infer,accuracy,0.4751004016064257
xnli_hi,guaranteed/possible/impossible,accuracy,0.40923694779116465
xnli_hi,justified in saying,accuracy,0.4469879518072289
xnli_hi,median,accuracy,0.4469879518072289
xnli_sw,GPT-3 style,accuracy,0.4827309236947791
xnli_sw,MNLI crowdsource,accuracy,0.40562248995983935
xnli_sw,can we infer,accuracy,0.44497991967871486
xnli_sw,guaranteed/possible/impossible,accuracy,0.42289156626506025
xnli_sw,justified in saying,accuracy,0.41124497991967873
xnli_sw,median,accuracy,0.42289156626506025
xnli_ur,GPT-3 style,accuracy,0.4947791164658635
xnli_ur,MNLI crowdsource,accuracy,0.39759036144578314
xnli_ur,can we infer,accuracy,0.4502008032128514
xnli_ur,guaranteed/possible/impossible,accuracy,0.39036144578313253
xnli_ur,justified in saying,accuracy,0.40843373493975904
xnli_ur,median,accuracy,0.40843373493975904
xnli_vi,GPT-3 style,accuracy,0.5449799196787148
xnli_vi,MNLI crowdsource,accuracy,0.40401606425702813
xnli_vi,can we infer,accuracy,0.5
xnli_vi,guaranteed/possible/impossible,accuracy,0.44779116465863456
xnli_vi,justified in saying,accuracy,0.4650602409638554
xnli_vi,median,accuracy,0.4650602409638554
xnli_zh,GPT-3 style,accuracy,0.5429718875502008
xnli_zh,MNLI crowdsource,accuracy,0.3891566265060241
xnli_zh,can we infer,accuracy,0.5032128514056224
xnli_zh,guaranteed/possible/impossible,accuracy,0.38072289156626504
xnli_zh,justified in saying,accuracy,0.4706827309236948
xnli_zh,median,accuracy,0.4706827309236948
xstory_cloze_ar,Answer Given options,accuracy,0.6896095301125083
xstory_cloze_ar,Choose Story Ending,accuracy,0.8378557246856386
xstory_cloze_ar,Generate Ending,accuracy,0.5956320317670417
xstory_cloze_ar,Novel Correct Ending,accuracy,0.8213103904698875
xstory_cloze_ar,Story Continuation and Options,accuracy,0.8219722038385175
xstory_cloze_ar,median,accuracy,0.8213103904698875
xstory_cloze_es,Answer Given options,accuracy,0.7683653209794837
xstory_cloze_es,Choose Story Ending,accuracy,0.886168100595632
xstory_cloze_es,Generate Ending,accuracy,0.6724023825281271
xstory_cloze_es,Novel Correct Ending,accuracy,0.8676373262739907
xstory_cloze_es,Story Continuation and Options,accuracy,0.8769027134348114
xstory_cloze_es,median,accuracy,0.8676373262739907
xstory_cloze_eu,Answer Given options,accuracy,0.6082064857710126
xstory_cloze_eu,Choose Story Ending,accuracy,0.7266710787557908
xstory_cloze_eu,Generate Ending,accuracy,0.5552614162806089
xstory_cloze_eu,Novel Correct Ending,accuracy,0.700198544010589
xstory_cloze_eu,Story Continuation and Options,accuracy,0.7107875579086698
xstory_cloze_eu,median,accuracy,0.700198544010589
xstory_cloze_hi,Answer Given options,accuracy,0.6366644606221046
xstory_cloze_hi,Choose Story Ending,accuracy,0.7882197220383852
xstory_cloze_hi,Generate Ending,accuracy,0.5982792852415619
xstory_cloze_hi,Novel Correct Ending,accuracy,0.7485109199205824
xstory_cloze_hi,Story Continuation and Options,accuracy,0.7683653209794837
xstory_cloze_hi,median,accuracy,0.7485109199205824
xstory_cloze_id,Answer Given options,accuracy,0.7385837193911317
xstory_cloze_id,Choose Story Ending,accuracy,0.8332230311052283
xstory_cloze_id,Generate Ending,accuracy,0.6293845135671741
xstory_cloze_id,Novel Correct Ending,accuracy,0.7816015883520847
xstory_cloze_id,Story Continuation and Options,accuracy,0.8226340172071476
xstory_cloze_id,median,accuracy,0.7816015883520847
xstory_cloze_zh,Answer Given options,accuracy,0.7498345466578424
xstory_cloze_zh,Choose Story Ending,accuracy,0.8583719391131701
xstory_cloze_zh,Generate Ending,accuracy,0.6227663798808736
xstory_cloze_zh,Novel Correct Ending,accuracy,0.8405029781601588
xstory_cloze_zh,Story Continuation and Options,accuracy,0.8385175380542687
xstory_cloze_zh,median,accuracy,0.8385175380542687
xwinograd_en,Replace,accuracy,0.6576344086021505
xwinograd_en,True or False,accuracy,0.5187096774193548
xwinograd_en,does underscore refer to,accuracy,0.5931182795698925
xwinograd_en,stand for,accuracy,0.5070967741935484
xwinograd_en,underscore refer to,accuracy,0.6210752688172043
xwinograd_en,median,accuracy,0.5931182795698925
xwinograd_fr,Replace,accuracy,0.5180722891566265
xwinograd_fr,True or False,accuracy,0.5301204819277109
xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795
xwinograd_fr,stand for,accuracy,0.5180722891566265
xwinograd_fr,underscore refer to,accuracy,0.5421686746987951
xwinograd_fr,median,accuracy,0.5301204819277109
xwinograd_pt,Replace,accuracy,0.5741444866920152
xwinograd_pt,True or False,accuracy,0.4790874524714829
xwinograd_pt,does underscore refer to,accuracy,0.55893536121673
xwinograd_pt,stand for,accuracy,0.5209125475285171
xwinograd_pt,underscore refer to,accuracy,0.5437262357414449
xwinograd_pt,median,accuracy,0.5437262357414449
xwinograd_zh,Replace,accuracy,0.626984126984127
xwinograd_zh,True or False,accuracy,0.503968253968254
xwinograd_zh,does underscore refer to,accuracy,0.5436507936507936
xwinograd_zh,stand for,accuracy,0.49007936507936506
xwinograd_zh,underscore refer to,accuracy,0.5535714285714286
xwinograd_zh,median,accuracy,0.5436507936507936
multiple,average,multiple,0.6067197952551315
|