task,metric,value,err,version anli_r1,acc,0.324,0.01480686473373886,0 anli_r2,acc,0.361,0.015195720118175124,0 anli_r3,acc,0.3641666666666667,0.013896714966807262,0 arc_challenge,acc,0.2764505119453925,0.013069662474252425,0 arc_challenge,acc_norm,0.2935153583617747,0.01330725044494112,0 arc_easy,acc,0.6001683501683501,0.010051788039412911,0 arc_easy,acc_norm,0.5787037037037037,0.010131882498193126,0 boolq,acc,0.5868501529051988,0.008612117547803578,1 cb,acc,0.42857142857142855,0.06672848092813058,1 cb,f1,0.24317460317460324,,1 copa,acc,0.75,0.04351941398892446,0 hellaswag,acc,0.45488946425014937,0.004969431900874299,0 hellaswag,acc_norm,0.5929097789285003,0.004902878806733035,0 piqa,acc,0.7377584330794341,0.010262502565172445,0 piqa,acc_norm,0.749727965179543,0.010106561880089775,0 rte,acc,0.44765342960288806,0.029931070362939526,0 sciq,acc,0.892,0.009820001651345693,0 sciq,acc_norm,0.872,0.010570133761108654,0 storycloze_2016,acc,0.7006948156066275,0.010590117252248798,0 winogrande,acc,0.5627466456195738,0.013941393310695917,0