dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.497
anli_dev_r1,MNLI crowdsource,accuracy,0.442
anli_dev_r1,can we infer,accuracy,0.456
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.328
anli_dev_r1,justified in saying,accuracy,0.46
anli_dev_r1,median,accuracy,0.456
anli_dev_r2,GPT-3 style,accuracy,0.45
anli_dev_r2,MNLI crowdsource,accuracy,0.382
anli_dev_r2,can we infer,accuracy,0.419
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.345
anli_dev_r2,justified in saying,accuracy,0.41
anli_dev_r2,median,accuracy,0.41
anli_dev_r3,GPT-3 style,accuracy,0.4558333333333333
anli_dev_r3,MNLI crowdsource,accuracy,0.41333333333333333
anli_dev_r3,can we infer,accuracy,0.4225
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.305
anli_dev_r3,justified in saying,accuracy,0.4083333333333333
anli_dev_r3,median,accuracy,0.41333333333333333
story_cloze_2016,Answer Given options,accuracy,0.9524318546231961
story_cloze_2016,Choose Story Ending,accuracy,0.9668626402993051
story_cloze_2016,Generate Ending,accuracy,0.7760555852485302
story_cloze_2016,Novel Correct Ending,accuracy,0.9583110636023516
story_cloze_2016,Story Continuation and Options,accuracy,0.9593800106894709
story_cloze_2016,median,accuracy,0.9583110636023516
super_glue_cb,GPT-3 style,accuracy,0.875
super_glue_cb,MNLI crowdsource,accuracy,0.35714285714285715
super_glue_cb,can we infer,accuracy,0.75
super_glue_cb,guaranteed/possible/impossible,accuracy,0.7678571428571429
super_glue_cb,justified in saying,accuracy,0.8035714285714286
super_glue_cb,median,accuracy,0.7678571428571429
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.75
super_glue_copa,best_option,accuracy,0.87
super_glue_copa,cause_effect,accuracy,0.9
super_glue_copa,i_am_hesitating,accuracy,0.91
super_glue_copa,plausible_alternatives,accuracy,0.91
super_glue_copa,median,accuracy,0.9
super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
super_glue_rte,MNLI crowdsource,accuracy,0.8592057761732852
super_glue_rte,does it follow that,accuracy,0.8194945848375451
super_glue_rte,guaranteed true,accuracy,0.7942238267148014
super_glue_rte,should assume,accuracy,0.8122743682310469
super_glue_rte,median,accuracy,0.8122743682310469
winogrande_winogrande_xl,Replace,accuracy,0.5998421468034728
winogrande_winogrande_xl,True or False,accuracy,0.5359116022099447
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5864246250986582
winogrande_winogrande_xl,stand for,accuracy,0.5201262825572218
winogrande_winogrande_xl,underscore refer to,accuracy,0.5880031570639306
winogrande_winogrande_xl,median,accuracy,0.5864246250986582
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.56
xcopa_id,best_option,accuracy,0.81
xcopa_id,cause_effect,accuracy,0.87
xcopa_id,i_am_hesitating,accuracy,0.83
xcopa_id,plausible_alternatives,accuracy,0.87
xcopa_id,median,accuracy,0.83
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.6
xcopa_sw,best_option,accuracy,0.62
xcopa_sw,cause_effect,accuracy,0.64
xcopa_sw,i_am_hesitating,accuracy,0.66
xcopa_sw,plausible_alternatives,accuracy,0.64
xcopa_sw,median,accuracy,0.64
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.59
xcopa_ta,best_option,accuracy,0.66
xcopa_ta,cause_effect,accuracy,0.7
xcopa_ta,i_am_hesitating,accuracy,0.69
xcopa_ta,plausible_alternatives,accuracy,0.64
xcopa_ta,median,accuracy,0.66
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.58
xcopa_vi,best_option,accuracy,0.81
xcopa_vi,cause_effect,accuracy,0.91
xcopa_vi,i_am_hesitating,accuracy,0.85
xcopa_vi,plausible_alternatives,accuracy,0.84
xcopa_vi,median,accuracy,0.84
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_zh,best_option,accuracy,0.84
xcopa_zh,cause_effect,accuracy,0.86
xcopa_zh,i_am_hesitating,accuracy,0.86
xcopa_zh,plausible_alternatives,accuracy,0.81
xcopa_zh,median,accuracy,0.84
xnli_ar,GPT-3 style,accuracy,0.5578313253012048
xnli_ar,MNLI crowdsource,accuracy,0.41164658634538154
xnli_ar,can we infer,accuracy,0.5152610441767068
xnli_ar,guaranteed/possible/impossible,accuracy,0.5803212851405622
xnli_ar,justified in saying,accuracy,0.5184738955823294
xnli_ar,median,accuracy,0.5184738955823294
xnli_en,GPT-3 style,accuracy,0.6176706827309237
xnli_en,MNLI crowdsource,accuracy,0.4606425702811245
xnli_en,can we infer,accuracy,0.5714859437751004
xnli_en,guaranteed/possible/impossible,accuracy,0.6180722891566265
xnli_en,justified in saying,accuracy,0.5746987951807229
xnli_en,median,accuracy,0.5746987951807229
xnli_es,GPT-3 style,accuracy,0.5911646586345382
xnli_es,MNLI crowdsource,accuracy,0.43052208835341366
xnli_es,can we infer,accuracy,0.4397590361445783
xnli_es,guaranteed/possible/impossible,accuracy,0.5208835341365462
xnli_es,justified in saying,accuracy,0.41726907630522087
xnli_es,median,accuracy,0.4397590361445783
xnli_fr,GPT-3 style,accuracy,0.5911646586345382
xnli_fr,MNLI crowdsource,accuracy,0.4321285140562249
xnli_fr,can we infer,accuracy,0.5369477911646586
xnli_fr,guaranteed/possible/impossible,accuracy,0.5176706827309236
xnli_fr,justified in saying,accuracy,0.5385542168674698
xnli_fr,median,accuracy,0.5369477911646586
xnli_hi,GPT-3 style,accuracy,0.5208835341365462
xnli_hi,MNLI crowdsource,accuracy,0.3819277108433735
xnli_hi,can we infer,accuracy,0.44176706827309237
xnli_hi,guaranteed/possible/impossible,accuracy,0.5253012048192771
xnli_hi,justified in saying,accuracy,0.44377510040160645
xnli_hi,median,accuracy,0.44377510040160645
xnli_sw,GPT-3 style,accuracy,0.5036144578313253
xnli_sw,MNLI crowdsource,accuracy,0.3887550200803213
xnli_sw,can we infer,accuracy,0.44216867469879517
xnli_sw,guaranteed/possible/impossible,accuracy,0.38795180722891565
xnli_sw,justified in saying,accuracy,0.4397590361445783
xnli_sw,median,accuracy,0.4397590361445783
xnli_ur,GPT-3 style,accuracy,0.4907630522088353
xnli_ur,MNLI crowdsource,accuracy,0.37309236947791163
xnli_ur,can we infer,accuracy,0.45863453815261046
xnli_ur,guaranteed/possible/impossible,accuracy,0.5124497991967871
xnli_ur,justified in saying,accuracy,0.45582329317269077
xnli_ur,median,accuracy,0.45863453815261046
xnli_vi,GPT-3 style,accuracy,0.5582329317269076
xnli_vi,MNLI crowdsource,accuracy,0.42690763052208835
xnli_vi,can we infer,accuracy,0.4759036144578313
xnli_vi,guaranteed/possible/impossible,accuracy,0.5008032128514056
xnli_vi,justified in saying,accuracy,0.4827309236947791
xnli_vi,median,accuracy,0.4827309236947791
xnli_zh,GPT-3 style,accuracy,0.5550200803212851
xnli_zh,MNLI crowdsource,accuracy,0.4248995983935743
xnli_zh,can we infer,accuracy,0.43052208835341366
xnli_zh,guaranteed/possible/impossible,accuracy,0.5526104417670683
xnli_zh,justified in saying,accuracy,0.44016064257028115
xnli_zh,median,accuracy,0.44016064257028115
xstory_cloze_ar,Answer Given options,accuracy,0.7835870284579749
xstory_cloze_ar,Choose Story Ending,accuracy,0.9291859695565851
xstory_cloze_ar,Generate Ending,accuracy,0.6624751819986764
xstory_cloze_ar,Novel Correct Ending,accuracy,0.9252150893448048
xstory_cloze_ar,Story Continuation and Options,accuracy,0.9159497021839841
xstory_cloze_ar,median,accuracy,0.9159497021839841
xstory_cloze_es,Answer Given options,accuracy,0.870946393117141
xstory_cloze_es,Choose Story Ending,accuracy,0.9523494374586366
xstory_cloze_es,Generate Ending,accuracy,0.7319655857048313
xstory_cloze_es,Novel Correct Ending,accuracy,0.9477167438782264
xstory_cloze_es,Story Continuation and Options,accuracy,0.9516876240900066
xstory_cloze_es,median,accuracy,0.9477167438782264
xstory_cloze_eu,Answer Given options,accuracy,0.6982131039046989
xstory_cloze_eu,Choose Story Ending,accuracy,0.85704831237591
xstory_cloze_eu,Generate Ending,accuracy,0.614162806088683
xstory_cloze_eu,Novel Correct Ending,accuracy,0.8590337524818001
xstory_cloze_eu,Story Continuation and Options,accuracy,0.8504301786896096
xstory_cloze_eu,median,accuracy,0.8504301786896096
xstory_cloze_hi,Answer Given options,accuracy,0.7683653209794837
xstory_cloze_hi,Choose Story Ending,accuracy,0.8742554599602912
xstory_cloze_hi,Generate Ending,accuracy,0.657180675049636
xstory_cloze_hi,Novel Correct Ending,accuracy,0.886829913964262
xstory_cloze_hi,Story Continuation and Options,accuracy,0.8762409000661814
xstory_cloze_hi,median,accuracy,0.8742554599602912
xstory_cloze_id,Answer Given options,accuracy,0.8332230311052283
xstory_cloze_id,Choose Story Ending,accuracy,0.913964262078094
xstory_cloze_id,Generate Ending,accuracy,0.700198544010589
xstory_cloze_id,Novel Correct Ending,accuracy,0.9205823957643945
xstory_cloze_id,Story Continuation and Options,accuracy,0.9086697551290536
xstory_cloze_id,median,accuracy,0.9086697551290536
xstory_cloze_zh,Answer Given options,accuracy,0.870946393117141
xstory_cloze_zh,Choose Story Ending,accuracy,0.9265387160820648
xstory_cloze_zh,Generate Ending,accuracy,0.6823295830575777
xstory_cloze_zh,Novel Correct Ending,accuracy,0.928524156187955
xstory_cloze_zh,Story Continuation and Options,accuracy,0.9232296492389146
xstory_cloze_zh,median,accuracy,0.9232296492389146
xwinograd_en,Replace,accuracy,0.6933333333333334
xwinograd_en,True or False,accuracy,0.5212903225806451
xwinograd_en,does underscore refer to,accuracy,0.6563440860215054
xwinograd_en,stand for,accuracy,0.5156989247311828
xwinograd_en,underscore refer to,accuracy,0.6473118279569893
xwinograd_en,median,accuracy,0.6473118279569893
xwinograd_fr,Replace,accuracy,0.6024096385542169
xwinograd_fr,True or False,accuracy,0.46987951807228917
xwinograd_fr,does underscore refer to,accuracy,0.5903614457831325
xwinograd_fr,stand for,accuracy,0.4939759036144578
xwinograd_fr,underscore refer to,accuracy,0.6867469879518072
xwinograd_fr,median,accuracy,0.5903614457831325
xwinograd_pt,Replace,accuracy,0.6463878326996197
xwinograd_pt,True or False,accuracy,0.5285171102661597
xwinograd_pt,does underscore refer to,accuracy,0.6007604562737643
xwinograd_pt,stand for,accuracy,0.49809885931558934
xwinograd_pt,underscore refer to,accuracy,0.6083650190114068
xwinograd_pt,median,accuracy,0.6007604562737643
xwinograd_zh,Replace,accuracy,0.7063492063492064
xwinograd_zh,True or False,accuracy,0.5515873015873016
xwinograd_zh,does underscore refer to,accuracy,0.621031746031746
xwinograd_zh,stand for,accuracy,0.5158730158730159
xwinograd_zh,underscore refer to,accuracy,0.6765873015873016
xwinograd_zh,median,accuracy,0.621031746031746
multiple,average,multiple,0.6665267892901372