arc_challenge | acc | 0.38993174061433444 |
---|
acc_stderr | 0.014252959848892884 |
---|
acc_norm | 0.4308873720136519 |
---|
acc_norm_stderr | 0.014471133392642475 |
---|
|
---|
hellaswag | acc | 0.5513841864170484 |
---|
acc_stderr | 0.004963362085275556 |
---|
acc_norm | 0.7257518422624976 |
---|
acc_norm_stderr | 0.00445222854104355 |
---|
|
---|
hendrycksTest-abstract_algebra | acc | 0.23 |
---|
acc_stderr | 0.04229525846816506 |
---|
acc_norm | 0.23 |
---|
acc_norm_stderr | 0.04229525846816506 |
---|
|
---|
hendrycksTest-anatomy | acc | 0.2962962962962963 |
---|
acc_stderr | 0.03944624162501116 |
---|
acc_norm | 0.2962962962962963 |
---|
acc_norm_stderr | 0.03944624162501116 |
---|
|
---|
hendrycksTest-astronomy | acc | 0.32894736842105265 |
---|
acc_stderr | 0.03823428969926603 |
---|
acc_norm | 0.32894736842105265 |
---|
acc_norm_stderr | 0.03823428969926603 |
---|
|
---|
hendrycksTest-business_ethics | acc | 0.3 |
---|
acc_stderr | 0.046056618647183814 |
---|
acc_norm | 0.3 |
---|
acc_norm_stderr | 0.046056618647183814 |
---|
|
---|
hendrycksTest-clinical_knowledge | acc | 0.2641509433962264 |
---|
acc_stderr | 0.027134291628741713 |
---|
acc_norm | 0.2641509433962264 |
---|
acc_norm_stderr | 0.027134291628741713 |
---|
|
---|
hendrycksTest-college_biology | acc | 0.2569444444444444 |
---|
acc_stderr | 0.03653946969442099 |
---|
acc_norm | 0.2569444444444444 |
---|
acc_norm_stderr | 0.03653946969442099 |
---|
|
---|
hendrycksTest-college_chemistry | acc | 0.22 |
---|
acc_stderr | 0.041633319989322695 |
---|
acc_norm | 0.22 |
---|
acc_norm_stderr | 0.041633319989322695 |
---|
|
---|
hendrycksTest-college_computer_science | acc | 0.26 |
---|
acc_stderr | 0.0440844002276808 |
---|
acc_norm | 0.26 |
---|
acc_norm_stderr | 0.0440844002276808 |
---|
|
---|
hendrycksTest-college_mathematics | acc | 0.31 |
---|
acc_stderr | 0.04648231987117316 |
---|
acc_norm | 0.31 |
---|
acc_norm_stderr | 0.04648231987117316 |
---|
|
---|
hendrycksTest-college_medicine | acc | 0.23121387283236994 |
---|
acc_stderr | 0.032147373020294696 |
---|
acc_norm | 0.23121387283236994 |
---|
acc_norm_stderr | 0.032147373020294696 |
---|
|
---|
hendrycksTest-college_physics | acc | 0.27450980392156865 |
---|
acc_stderr | 0.04440521906179327 |
---|
acc_norm | 0.27450980392156865 |
---|
acc_norm_stderr | 0.04440521906179327 |
---|
|
---|
hendrycksTest-computer_security | acc | 0.36 |
---|
acc_stderr | 0.048241815132442176 |
---|
acc_norm | 0.36 |
---|
acc_norm_stderr | 0.048241815132442176 |
---|
|
---|
hendrycksTest-conceptual_physics | acc | 0.2765957446808511 |
---|
acc_stderr | 0.029241883869628837 |
---|
acc_norm | 0.2765957446808511 |
---|
acc_norm_stderr | 0.029241883869628837 |
---|
|
---|
hendrycksTest-econometrics | acc | 0.2631578947368421 |
---|
acc_stderr | 0.04142439719489363 |
---|
acc_norm | 0.2631578947368421 |
---|
acc_norm_stderr | 0.04142439719489363 |
---|
|
---|
hendrycksTest-electrical_engineering | acc | 0.20689655172413793 |
---|
acc_stderr | 0.03375672449560554 |
---|
acc_norm | 0.20689655172413793 |
---|
acc_norm_stderr | 0.03375672449560554 |
---|
|
---|
hendrycksTest-elementary_mathematics | acc | 0.2698412698412698 |
---|
acc_stderr | 0.022860838309232072 |
---|
acc_norm | 0.2698412698412698 |
---|
acc_norm_stderr | 0.022860838309232072 |
---|
|
---|
hendrycksTest-formal_logic | acc | 0.2619047619047619 |
---|
acc_stderr | 0.039325376803928704 |
---|
acc_norm | 0.2619047619047619 |
---|
acc_norm_stderr | 0.039325376803928704 |
---|
|
---|
hendrycksTest-global_facts | acc | 0.35 |
---|
acc_stderr | 0.047937248544110196 |
---|
acc_norm | 0.35 |
---|
acc_norm_stderr | 0.047937248544110196 |
---|
|
---|
hendrycksTest-high_school_biology | acc | 0.24193548387096775 |
---|
acc_stderr | 0.0243625996930311 |
---|
acc_norm | 0.24193548387096775 |
---|
acc_norm_stderr | 0.0243625996930311 |
---|
|
---|
hendrycksTest-high_school_chemistry | acc | 0.28078817733990147 |
---|
acc_stderr | 0.0316185633535861 |
---|
acc_norm | 0.28078817733990147 |
---|
acc_norm_stderr | 0.0316185633535861 |
---|
|
---|
hendrycksTest-high_school_computer_science | acc | 0.33 |
---|
acc_stderr | 0.04725815626252605 |
---|
acc_norm | 0.33 |
---|
acc_norm_stderr | 0.04725815626252605 |
---|
|
---|
hendrycksTest-high_school_european_history | acc | 0.296969696969697 |
---|
acc_stderr | 0.03567969772268048 |
---|
acc_norm | 0.296969696969697 |
---|
acc_norm_stderr | 0.03567969772268048 |
---|
|
---|
hendrycksTest-high_school_geography | acc | 0.2878787878787879 |
---|
acc_stderr | 0.03225883512300993 |
---|
acc_norm | 0.2878787878787879 |
---|
acc_norm_stderr | 0.03225883512300993 |
---|
|
---|
hendrycksTest-high_school_government_and_politics | acc | 0.2538860103626943 |
---|
acc_stderr | 0.0314102478056532 |
---|
acc_norm | 0.2538860103626943 |
---|
acc_norm_stderr | 0.0314102478056532 |
---|
|
---|
hendrycksTest-high_school_macroeconomics | acc | 0.2743589743589744 |
---|
acc_stderr | 0.022622765767493207 |
---|
acc_norm | 0.2743589743589744 |
---|
acc_norm_stderr | 0.022622765767493207 |
---|
|
---|
hendrycksTest-high_school_mathematics | acc | 0.26296296296296295 |
---|
acc_stderr | 0.026842057873833706 |
---|
acc_norm | 0.26296296296296295 |
---|
acc_norm_stderr | 0.026842057873833706 |
---|
|
---|
hendrycksTest-high_school_microeconomics | acc | 0.2647058823529412 |
---|
acc_stderr | 0.028657491285071977 |
---|
acc_norm | 0.2647058823529412 |
---|
acc_norm_stderr | 0.028657491285071977 |
---|
|
---|
hendrycksTest-high_school_physics | acc | 0.304635761589404 |
---|
acc_stderr | 0.03757949922943343 |
---|
acc_norm | 0.304635761589404 |
---|
acc_norm_stderr | 0.03757949922943343 |
---|
|
---|
hendrycksTest-high_school_psychology | acc | 0.28623853211009176 |
---|
acc_stderr | 0.019379436628919968 |
---|
acc_norm | 0.28623853211009176 |
---|
acc_norm_stderr | 0.019379436628919968 |
---|
|
---|
hendrycksTest-high_school_statistics | acc | 0.25462962962962965 |
---|
acc_stderr | 0.02971127586000535 |
---|
acc_norm | 0.25462962962962965 |
---|
acc_norm_stderr | 0.02971127586000535 |
---|
|
---|
hendrycksTest-high_school_us_history | acc | 0.23039215686274508 |
---|
acc_stderr | 0.029554292605695053 |
---|
acc_norm | 0.23039215686274508 |
---|
acc_norm_stderr | 0.029554292605695053 |
---|
|
---|
hendrycksTest-high_school_world_history | acc | 0.2869198312236287 |
---|
acc_stderr | 0.029443773022594693 |
---|
acc_norm | 0.2869198312236287 |
---|
acc_norm_stderr | 0.029443773022594693 |
---|
|
---|
hendrycksTest-human_aging | acc | 0.3811659192825112 |
---|
acc_stderr | 0.03259625118416827 |
---|
acc_norm | 0.3811659192825112 |
---|
acc_norm_stderr | 0.03259625118416827 |
---|
|
---|
hendrycksTest-human_sexuality | acc | 0.1984732824427481 |
---|
acc_stderr | 0.03498149385462472 |
---|
acc_norm | 0.1984732824427481 |
---|
acc_norm_stderr | 0.03498149385462472 |
---|
|
---|
hendrycksTest-international_law | acc | 0.3884297520661157 |
---|
acc_stderr | 0.04449270350068382 |
---|
acc_norm | 0.3884297520661157 |
---|
acc_norm_stderr | 0.04449270350068382 |
---|
|
---|
hendrycksTest-jurisprudence | acc | 0.23148148148148148 |
---|
acc_stderr | 0.04077494709252627 |
---|
acc_norm | 0.23148148148148148 |
---|
acc_norm_stderr | 0.04077494709252627 |
---|
|
---|
hendrycksTest-logical_fallacies | acc | 0.2331288343558282 |
---|
acc_stderr | 0.03322015795776741 |
---|
acc_norm | 0.2331288343558282 |
---|
acc_norm_stderr | 0.03322015795776741 |
---|
|
---|
hendrycksTest-machine_learning | acc | 0.21428571428571427 |
---|
acc_stderr | 0.03894641120044792 |
---|
acc_norm | 0.21428571428571427 |
---|
acc_norm_stderr | 0.03894641120044792 |
---|
|
---|
hendrycksTest-management | acc | 0.3300970873786408 |
---|
acc_stderr | 0.04656147110012352 |
---|
acc_norm | 0.3300970873786408 |
---|
acc_norm_stderr | 0.04656147110012352 |
---|
|
---|
hendrycksTest-marketing | acc | 0.2905982905982906 |
---|
acc_stderr | 0.029745048572674078 |
---|
acc_norm | 0.2905982905982906 |
---|
acc_norm_stderr | 0.029745048572674078 |
---|
|
---|
hendrycksTest-medical_genetics | acc | 0.29 |
---|
acc_stderr | 0.04560480215720684 |
---|
acc_norm | 0.29 |
---|
acc_norm_stderr | 0.04560480215720684 |
---|
|
---|
hendrycksTest-miscellaneous | acc | 0.31545338441890164 |
---|
acc_stderr | 0.016617501738763394 |
---|
acc_norm | 0.31545338441890164 |
---|
acc_norm_stderr | 0.016617501738763394 |
---|
|
---|
hendrycksTest-moral_disputes | acc | 0.2861271676300578 |
---|
acc_stderr | 0.02433214677913413 |
---|
acc_norm | 0.2861271676300578 |
---|
acc_norm_stderr | 0.02433214677913413 |
---|
|
---|
hendrycksTest-moral_scenarios | acc | 0.2122905027932961 |
---|
acc_stderr | 0.01367664468583173 |
---|
acc_norm | 0.2122905027932961 |
---|
acc_norm_stderr | 0.01367664468583173 |
---|
|
---|
hendrycksTest-nutrition | acc | 0.2875816993464052 |
---|
acc_stderr | 0.02591780611714716 |
---|
acc_norm | 0.2875816993464052 |
---|
acc_norm_stderr | 0.02591780611714716 |
---|
|
---|
hendrycksTest-philosophy | acc | 0.2765273311897106 |
---|
acc_stderr | 0.02540383297817961 |
---|
acc_norm | 0.2765273311897106 |
---|
acc_norm_stderr | 0.02540383297817961 |
---|
|
---|
hendrycksTest-prehistory | acc | 0.3117283950617284 |
---|
acc_stderr | 0.025773111169630446 |
---|
acc_norm | 0.3117283950617284 |
---|
acc_norm_stderr | 0.025773111169630446 |
---|
|
---|
hendrycksTest-professional_accounting | acc | 0.26595744680851063 |
---|
acc_stderr | 0.026358065698880592 |
---|
acc_norm | 0.26595744680851063 |
---|
acc_norm_stderr | 0.026358065698880592 |
---|
|
---|
hendrycksTest-professional_law | acc | 0.25684485006518903 |
---|
acc_stderr | 0.011158455853098832 |
---|
acc_norm | 0.25684485006518903 |
---|
acc_norm_stderr | 0.011158455853098832 |
---|
|
---|
hendrycksTest-professional_medicine | acc | 0.1801470588235294 |
---|
acc_stderr | 0.023345163616544855 |
---|
acc_norm | 0.1801470588235294 |
---|
acc_norm_stderr | 0.023345163616544855 |
---|
|
---|
hendrycksTest-professional_psychology | acc | 0.27941176470588236 |
---|
acc_stderr | 0.018152871051538802 |
---|
acc_norm | 0.27941176470588236 |
---|
acc_norm_stderr | 0.018152871051538802 |
---|
|
---|
hendrycksTest-public_relations | acc | 0.3090909090909091 |
---|
acc_stderr | 0.044262946482000985 |
---|
acc_norm | 0.3090909090909091 |
---|
acc_norm_stderr | 0.044262946482000985 |
---|
|
---|
hendrycksTest-security_studies | acc | 0.32653061224489793 |
---|
acc_stderr | 0.030021056238440313 |
---|
acc_norm | 0.32653061224489793 |
---|
acc_norm_stderr | 0.030021056238440313 |
---|
|
---|
hendrycksTest-sociology | acc | 0.25870646766169153 |
---|
acc_stderr | 0.030965903123573026 |
---|
acc_norm | 0.25870646766169153 |
---|
acc_norm_stderr | 0.030965903123573026 |
---|
|
---|
hendrycksTest-us_foreign_policy | acc | 0.32 |
---|
acc_stderr | 0.04688261722621504 |
---|
acc_norm | 0.32 |
---|
acc_norm_stderr | 0.04688261722621504 |
---|
|
---|
hendrycksTest-virology | acc | 0.30120481927710846 |
---|
acc_stderr | 0.0357160923005348 |
---|
acc_norm | 0.30120481927710846 |
---|
acc_norm_stderr | 0.0357160923005348 |
---|
|
---|
hendrycksTest-world_religions | acc | 0.32748538011695905 |
---|
acc_stderr | 0.035993357714560276 |
---|
acc_norm | 0.32748538011695905 |
---|
acc_norm_stderr | 0.035993357714560276 |
---|
|
---|
truthfulqa_mc | mc1 | 0.2423500611995104 |
---|
mc1_stderr | 0.01500067437357034 |
---|
mc2 | 0.3859757929597962 |
---|
mc2_stderr | 0.013898628036488968 |
---|
|
---|