Muennighoff's picture
Organize eval
8634ce5
raw
history blame contribute delete
No virus
10.5 kB
dataset,prompt,metric,value
anli_dev_r1,GPT-3 style,accuracy,0.351
anli_dev_r1,MNLI crowdsource,accuracy,0.334
anli_dev_r1,can we infer,accuracy,0.351
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.288
anli_dev_r1,justified in saying,accuracy,0.345
anli_dev_r1,median,accuracy,0.345
anli_dev_r2,GPT-3 style,accuracy,0.339
anli_dev_r2,MNLI crowdsource,accuracy,0.335
anli_dev_r2,can we infer,accuracy,0.354
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.297
anli_dev_r2,justified in saying,accuracy,0.345
anli_dev_r2,median,accuracy,0.339
anli_dev_r3,GPT-3 style,accuracy,0.37583333333333335
anli_dev_r3,MNLI crowdsource,accuracy,0.3408333333333333
anli_dev_r3,can we infer,accuracy,0.36333333333333334
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.31083333333333335
anli_dev_r3,justified in saying,accuracy,0.34
anli_dev_r3,median,accuracy,0.3408333333333333
story_cloze_2016,Answer Given options,accuracy,0.8305718866916088
story_cloze_2016,Choose Story Ending,accuracy,0.8706574024585783
story_cloze_2016,Generate Ending,accuracy,0.7183324425440941
story_cloze_2016,Novel Correct Ending,accuracy,0.848743987172635
story_cloze_2016,Story Continuation and Options,accuracy,0.8466060929983966
story_cloze_2016,median,accuracy,0.8466060929983966
super_glue_cb,GPT-3 style,accuracy,0.625
super_glue_cb,MNLI crowdsource,accuracy,0.08928571428571429
super_glue_cb,can we infer,accuracy,0.5892857142857143
super_glue_cb,guaranteed/possible/impossible,accuracy,0.5
super_glue_cb,justified in saying,accuracy,0.5357142857142857
super_glue_cb,median,accuracy,0.5357142857142857
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66
super_glue_copa,best_option,accuracy,0.67
super_glue_copa,cause_effect,accuracy,0.78
super_glue_copa,i_am_hesitating,accuracy,0.8
super_glue_copa,plausible_alternatives,accuracy,0.81
super_glue_copa,median,accuracy,0.78
super_glue_rte,GPT-3 style,accuracy,0.7870036101083032
super_glue_rte,MNLI crowdsource,accuracy,0.7220216606498195
super_glue_rte,does it follow that,accuracy,0.6678700361010831
super_glue_rte,guaranteed true,accuracy,0.6714801444043321
super_glue_rte,should assume,accuracy,0.6678700361010831
super_glue_rte,median,accuracy,0.6714801444043321
winogrande_winogrande_xl,Replace,accuracy,0.5406471981057617
winogrande_winogrande_xl,True or False,accuracy,0.5074980268350434
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5177584846093133
winogrande_winogrande_xl,stand for,accuracy,0.510655090765588
winogrande_winogrande_xl,underscore refer to,accuracy,0.5256511444356748
winogrande_winogrande_xl,median,accuracy,0.5177584846093133
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.47
xcopa_id,best_option,accuracy,0.51
xcopa_id,cause_effect,accuracy,0.65
xcopa_id,i_am_hesitating,accuracy,0.66
xcopa_id,plausible_alternatives,accuracy,0.67
xcopa_id,median,accuracy,0.65
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.58
xcopa_sw,best_option,accuracy,0.57
xcopa_sw,cause_effect,accuracy,0.46
xcopa_sw,i_am_hesitating,accuracy,0.48
xcopa_sw,plausible_alternatives,accuracy,0.45
xcopa_sw,median,accuracy,0.48
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.57
xcopa_ta,best_option,accuracy,0.67
xcopa_ta,cause_effect,accuracy,0.71
xcopa_ta,i_am_hesitating,accuracy,0.71
xcopa_ta,plausible_alternatives,accuracy,0.69
xcopa_ta,median,accuracy,0.69
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.55
xcopa_vi,best_option,accuracy,0.61
xcopa_vi,cause_effect,accuracy,0.67
xcopa_vi,i_am_hesitating,accuracy,0.66
xcopa_vi,plausible_alternatives,accuracy,0.65
xcopa_vi,median,accuracy,0.65
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.62
xcopa_zh,best_option,accuracy,0.61
xcopa_zh,cause_effect,accuracy,0.77
xcopa_zh,i_am_hesitating,accuracy,0.72
xcopa_zh,plausible_alternatives,accuracy,0.74
xcopa_zh,median,accuracy,0.72
xnli_ar,GPT-3 style,accuracy,0.5040160642570282
xnli_ar,MNLI crowdsource,accuracy,0.39879518072289155
xnli_ar,can we infer,accuracy,0.506425702811245
xnli_ar,guaranteed/possible/impossible,accuracy,0.4799196787148594
xnli_ar,justified in saying,accuracy,0.41526104417670684
xnli_ar,median,accuracy,0.4799196787148594
xnli_en,GPT-3 style,accuracy,0.5590361445783133
xnli_en,MNLI crowdsource,accuracy,0.342570281124498
xnli_en,can we infer,accuracy,0.5449799196787148
xnli_en,guaranteed/possible/impossible,accuracy,0.41164658634538154
xnli_en,justified in saying,accuracy,0.4634538152610442
xnli_en,median,accuracy,0.4634538152610442
xnli_es,GPT-3 style,accuracy,0.5373493975903615
xnli_es,MNLI crowdsource,accuracy,0.40441767068273093
xnli_es,can we infer,accuracy,0.5277108433734939
xnli_es,guaranteed/possible/impossible,accuracy,0.44216867469879517
xnli_es,justified in saying,accuracy,0.4534136546184739
xnli_es,median,accuracy,0.4534136546184739
xnli_fr,GPT-3 style,accuracy,0.5248995983935743
xnli_fr,MNLI crowdsource,accuracy,0.3895582329317269
xnli_fr,can we infer,accuracy,0.5337349397590362
xnli_fr,guaranteed/possible/impossible,accuracy,0.42971887550200805
xnli_fr,justified in saying,accuracy,0.4738955823293173
xnli_fr,median,accuracy,0.4738955823293173
xnli_hi,GPT-3 style,accuracy,0.4983935742971888
xnli_hi,MNLI crowdsource,accuracy,0.38714859437751004
xnli_hi,can we infer,accuracy,0.45542168674698796
xnli_hi,guaranteed/possible/impossible,accuracy,0.41405622489959837
xnli_hi,justified in saying,accuracy,0.38795180722891565
xnli_hi,median,accuracy,0.41405622489959837
xnli_sw,GPT-3 style,accuracy,0.43493975903614457
xnli_sw,MNLI crowdsource,accuracy,0.363855421686747
xnli_sw,can we infer,accuracy,0.42891566265060244
xnli_sw,guaranteed/possible/impossible,accuracy,0.3457831325301205
xnli_sw,justified in saying,accuracy,0.3650602409638554
xnli_sw,median,accuracy,0.3650602409638554
xnli_ur,GPT-3 style,accuracy,0.43493975903614457
xnli_ur,MNLI crowdsource,accuracy,0.3895582329317269
xnli_ur,can we infer,accuracy,0.45180722891566266
xnli_ur,guaranteed/possible/impossible,accuracy,0.40120481927710844
xnli_ur,justified in saying,accuracy,0.37630522088353413
xnli_ur,median,accuracy,0.40120481927710844
xnli_vi,GPT-3 style,accuracy,0.5196787148594377
xnli_vi,MNLI crowdsource,accuracy,0.38112449799196785
xnli_vi,can we infer,accuracy,0.5080321285140562
xnli_vi,guaranteed/possible/impossible,accuracy,0.38393574297188754
xnli_vi,justified in saying,accuracy,0.43614457831325304
xnli_vi,median,accuracy,0.43614457831325304
xnli_zh,GPT-3 style,accuracy,0.5052208835341365
xnli_zh,MNLI crowdsource,accuracy,0.4
xnli_zh,can we infer,accuracy,0.5228915662650603
xnli_zh,guaranteed/possible/impossible,accuracy,0.4738955823293173
xnli_zh,justified in saying,accuracy,0.45863453815261046
xnli_zh,median,accuracy,0.4738955823293173
xstory_cloze_ar,Answer Given options,accuracy,0.7518199867637326
xstory_cloze_ar,Choose Story Ending,accuracy,0.7749834546657842
xstory_cloze_ar,Generate Ending,accuracy,0.586366644606221
xstory_cloze_ar,Novel Correct Ending,accuracy,0.7518199867637326
xstory_cloze_ar,Story Continuation and Options,accuracy,0.7438782263401721
xstory_cloze_ar,median,accuracy,0.7518199867637326
xstory_cloze_es,Answer Given options,accuracy,0.7835870284579749
xstory_cloze_es,Choose Story Ending,accuracy,0.8292521508934481
xstory_cloze_es,Generate Ending,accuracy,0.6399735274652548
xstory_cloze_es,Novel Correct Ending,accuracy,0.7935142289874255
xstory_cloze_es,Story Continuation and Options,accuracy,0.7888815354070152
xstory_cloze_es,median,accuracy,0.7888815354070152
xstory_cloze_eu,Answer Given options,accuracy,0.7041694242223693
xstory_cloze_eu,Choose Story Ending,accuracy,0.6823295830575777
xstory_cloze_eu,Generate Ending,accuracy,0.5625413633355394
xstory_cloze_eu,Novel Correct Ending,accuracy,0.6671078755790867
xstory_cloze_eu,Story Continuation and Options,accuracy,0.671740569159497
xstory_cloze_eu,median,accuracy,0.671740569159497
xstory_cloze_hi,Answer Given options,accuracy,0.6915949702183984
xstory_cloze_hi,Choose Story Ending,accuracy,0.7220383851753805
xstory_cloze_hi,Generate Ending,accuracy,0.5883520847121112
xstory_cloze_hi,Novel Correct Ending,accuracy,0.6743878226340172
xstory_cloze_hi,Story Continuation and Options,accuracy,0.6816677696889477
xstory_cloze_hi,median,accuracy,0.6816677696889477
xstory_cloze_id,Answer Given options,accuracy,0.7445400397088021
xstory_cloze_id,Choose Story Ending,accuracy,0.771012574454004
xstory_cloze_id,Generate Ending,accuracy,0.6029119788219722
xstory_cloze_id,Novel Correct Ending,accuracy,0.7485109199205824
xstory_cloze_id,Story Continuation and Options,accuracy,0.7438782263401721
xstory_cloze_id,median,accuracy,0.7445400397088021
xstory_cloze_zh,Answer Given options,accuracy,0.7610853739245532
xstory_cloze_zh,Choose Story Ending,accuracy,0.7961614824619457
xstory_cloze_zh,Generate Ending,accuracy,0.6214427531436135
xstory_cloze_zh,Novel Correct Ending,accuracy,0.7696889477167439
xstory_cloze_zh,Story Continuation and Options,accuracy,0.7670416942422237
xstory_cloze_zh,median,accuracy,0.7670416942422237
xwinograd_en,Replace,accuracy,0.5225806451612903
xwinograd_en,True or False,accuracy,0.48946236559139783
xwinograd_en,does underscore refer to,accuracy,0.5281720430107527
xwinograd_en,stand for,accuracy,0.5062365591397849
xwinograd_en,underscore refer to,accuracy,0.5372043010752688
xwinograd_en,median,accuracy,0.5225806451612903
xwinograd_fr,Replace,accuracy,0.5060240963855421
xwinograd_fr,True or False,accuracy,0.5421686746987951
xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795
xwinograd_fr,stand for,accuracy,0.4819277108433735
xwinograd_fr,underscore refer to,accuracy,0.5301204819277109
xwinograd_fr,median,accuracy,0.5301204819277109
xwinograd_pt,Replace,accuracy,0.5133079847908745
xwinograd_pt,True or False,accuracy,0.4714828897338403
xwinograd_pt,does underscore refer to,accuracy,0.5209125475285171
xwinograd_pt,stand for,accuracy,0.5019011406844106
xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
xwinograd_pt,median,accuracy,0.5133079847908745
xwinograd_zh,Replace,accuracy,0.5257936507936508
xwinograd_zh,True or False,accuracy,0.5297619047619048
xwinograd_zh,does underscore refer to,accuracy,0.5218253968253969
xwinograd_zh,stand for,accuracy,0.4444444444444444
xwinograd_zh,underscore refer to,accuracy,0.5198412698412699
xwinograd_zh,median,accuracy,0.5218253968253969
multiple,average,multiple,0.5631550819200618