{ "results": { "hendrycksTest-high_school_world_history": { "acc": 0.6962025316455697, "acc_stderr": 0.029936696387138598, "acc_norm": 0.569620253164557, "acc_norm_stderr": 0.032230171959375976 }, "hendrycksTest-formal_logic": { "acc": 0.42063492063492064, "acc_stderr": 0.04415438226743743, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.043758884927270605 }, "hendrycksTest-human_aging": { "acc": 0.672645739910314, "acc_stderr": 0.03149384670994131, "acc_norm": 0.3632286995515695, "acc_norm_stderr": 0.032277904428505 }, "hendrycksTest-international_law": { "acc": 0.7024793388429752, "acc_stderr": 0.04173349148083499, "acc_norm": 0.768595041322314, "acc_norm_stderr": 0.03849856098794088 }, "hendrycksTest-security_studies": { "acc": 0.5714285714285714, "acc_stderr": 0.031680911612338825, "acc_norm": 0.40408163265306124, "acc_norm_stderr": 0.0314147080258659 }, "hendrycksTest-medical_genetics": { "acc": 0.6, "acc_stderr": 0.049236596391733084, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "hendrycksTest-econometrics": { "acc": 0.3508771929824561, "acc_stderr": 0.044895393502707, "acc_norm": 0.3157894736842105, "acc_norm_stderr": 0.043727482902780064 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.5153846153846153, "acc_stderr": 0.025339003010106515, "acc_norm": 0.4153846153846154, "acc_norm_stderr": 0.024985354923102332 }, "hendrycksTest-us_foreign_policy": { "acc": 0.79, "acc_stderr": 0.040936018074033256, "acc_norm": 0.59, "acc_norm_stderr": 0.049431107042371025 }, "hendrycksTest-logical_fallacies": { "acc": 0.6993865030674846, "acc_stderr": 0.03602511318806771, "acc_norm": 0.5398773006134969, "acc_norm_stderr": 0.039158572914369714 }, "hendrycksTest-prehistory": { "acc": 0.6635802469135802, "acc_stderr": 0.026289734945952926, "acc_norm": 0.42901234567901236, "acc_norm_stderr": 0.027538925613470867 }, "hendrycksTest-professional_psychology": { "acc": 0.5882352941176471, "acc_stderr": 0.019910377463105932, "acc_norm": 0.43300653594771243, "acc_norm_stderr": 0.02004544247332422 }, "hendrycksTest-professional_accounting": { "acc": 0.3971631205673759, "acc_stderr": 0.029189805673587105, "acc_norm": 0.33687943262411346, "acc_norm_stderr": 0.02819553487396673 }, "hendrycksTest-college_biology": { "acc": 0.6111111111111112, "acc_stderr": 0.04076663253918567, "acc_norm": 0.4236111111111111, "acc_norm_stderr": 0.04132125019723369 }, "hendrycksTest-high_school_biology": { "acc": 0.6709677419354839, "acc_stderr": 0.02672949906834996, "acc_norm": 0.5451612903225806, "acc_norm_stderr": 0.028327743091561074 }, "hendrycksTest-philosophy": { "acc": 0.6752411575562701, "acc_stderr": 0.02659678228769704, "acc_norm": 0.5016077170418006, "acc_norm_stderr": 0.02839794490780661 }, "hendrycksTest-high_school_european_history": { "acc": 0.696969696969697, "acc_stderr": 0.03588624800091707, "acc_norm": 0.5636363636363636, "acc_norm_stderr": 0.03872592983524754 }, "hendrycksTest-college_medicine": { "acc": 0.5144508670520231, "acc_stderr": 0.03810871630454764, "acc_norm": 0.43352601156069365, "acc_norm_stderr": 0.03778621079092055 }, "hendrycksTest-professional_medicine": { "acc": 0.5551470588235294, "acc_stderr": 0.03018753206032938, "acc_norm": 0.35661764705882354, "acc_norm_stderr": 0.02909720956841195 }, "hendrycksTest-moral_scenarios": { "acc": 0.34301675977653634, "acc_stderr": 0.015876912673057724, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "hendrycksTest-high_school_chemistry": { "acc": 0.39901477832512317, "acc_stderr": 0.03445487686264716, 
"acc_norm": 0.3694581280788177, "acc_norm_stderr": 0.03395970381998573 }, "hendrycksTest-high_school_physics": { "acc": 0.31788079470198677, "acc_stderr": 0.038020397601079024, "acc_norm": 0.31125827814569534, "acc_norm_stderr": 0.03780445850526733 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.8082901554404145, "acc_stderr": 0.028408953626245282, "acc_norm": 0.6113989637305699, "acc_norm_stderr": 0.03517739796373132 }, "hendrycksTest-high_school_geography": { "acc": 0.7575757575757576, "acc_stderr": 0.030532892233932026, "acc_norm": 0.5505050505050505, "acc_norm_stderr": 0.0354413249194797 }, "hendrycksTest-global_facts": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "hendrycksTest-professional_law": { "acc": 0.4002607561929596, "acc_stderr": 0.012513582529136213, "acc_norm": 0.3435462842242503, "acc_norm_stderr": 0.012128961174190158 }, "hendrycksTest-college_mathematics": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "hendrycksTest-college_physics": { "acc": 0.23529411764705882, "acc_stderr": 0.04220773659171452, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929774 }, "hendrycksTest-high_school_statistics": { "acc": 0.4351851851851852, "acc_stderr": 0.03381200005643525, "acc_norm": 0.35648148148148145, "acc_norm_stderr": 0.032664783315272714 }, "hendrycksTest-machine_learning": { "acc": 0.4017857142857143, "acc_stderr": 0.04653333146973646, "acc_norm": 0.30357142857142855, "acc_norm_stderr": 0.04364226155841044 }, "hendrycksTest-public_relations": { "acc": 0.6454545454545455, "acc_stderr": 0.045820048415054174, "acc_norm": 0.4090909090909091, "acc_norm_stderr": 0.047093069786618966 }, "hendrycksTest-high_school_computer_science": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, "hendrycksTest-high_school_psychology": { "acc": 0.7706422018348624, "acc_stderr": 0.018025349724618684, "acc_norm": 0.5541284403669725, "acc_norm_stderr": 0.021311335009708582 }, "hendrycksTest-virology": { "acc": 0.4939759036144578, "acc_stderr": 0.03892212195333045, "acc_norm": 0.3433734939759036, "acc_norm_stderr": 0.03696584317010601 }, "hendrycksTest-marketing": { "acc": 0.8461538461538461, "acc_stderr": 0.023636873317489294, "acc_norm": 0.7649572649572649, "acc_norm_stderr": 0.027778835904935437 }, "hendrycksTest-human_sexuality": { "acc": 0.7022900763358778, "acc_stderr": 0.04010358942462203, "acc_norm": 0.46564885496183206, "acc_norm_stderr": 0.04374928560599738 }, "hendrycksTest-sociology": { "acc": 0.7611940298507462, "acc_stderr": 0.03014777593540922, "acc_norm": 0.6616915422885572, "acc_norm_stderr": 0.033455630703391914 }, "hendrycksTest-college_computer_science": { "acc": 0.43, "acc_stderr": 0.049756985195624284, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "hendrycksTest-conceptual_physics": { "acc": 0.5106382978723404, "acc_stderr": 0.03267862331014063, "acc_norm": 0.3276595744680851, "acc_norm_stderr": 0.030683020843231004 }, "hendrycksTest-anatomy": { "acc": 0.5185185185185185, "acc_stderr": 0.043163785995113245, "acc_norm": 0.4074074074074074, "acc_norm_stderr": 0.04244633238353228 }, "hendrycksTest-miscellaneous": { "acc": 0.8186462324393359, "acc_stderr": 0.013778693778464062, "acc_norm": 0.6143039591315453, "acc_norm_stderr": 0.017406476619212907 }, "hendrycksTest-jurisprudence": { "acc": 0.6666666666666666, "acc_stderr": 0.04557239513497751, "acc_norm": 
0.5555555555555556, "acc_norm_stderr": 0.04803752235190193 }, "hendrycksTest-moral_disputes": { "acc": 0.6184971098265896, "acc_stderr": 0.026152198619726792, "acc_norm": 0.4595375722543353, "acc_norm_stderr": 0.026830805998952236 }, "hendrycksTest-high_school_us_history": { "acc": 0.7205882352941176, "acc_stderr": 0.031493281045079556, "acc_norm": 0.553921568627451, "acc_norm_stderr": 0.03488845451304974 }, "hendrycksTest-high_school_mathematics": { "acc": 0.25925925925925924, "acc_stderr": 0.026719240783712177, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.02831753349606648 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.5840336134453782, "acc_stderr": 0.032016501007396114, "acc_norm": 0.4831932773109244, "acc_norm_stderr": 0.03246013680375308 }, "hendrycksTest-astronomy": { "acc": 0.5723684210526315, "acc_stderr": 0.04026097083296564, "acc_norm": 0.5657894736842105, "acc_norm_stderr": 0.04033565667848319 }, "hendrycksTest-world_religions": { "acc": 0.8128654970760234, "acc_stderr": 0.029913127232368043, "acc_norm": 0.7660818713450293, "acc_norm_stderr": 0.03246721765117825 }, "hendrycksTest-clinical_knowledge": { "acc": 0.5320754716981132, "acc_stderr": 0.03070948699255654, "acc_norm": 0.4641509433962264, "acc_norm_stderr": 0.030693675018458003 }, "hendrycksTest-college_chemistry": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-abstract_algebra": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "hendrycksTest-business_ethics": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, "hendrycksTest-elementary_mathematics": { "acc": 0.4417989417989418, "acc_stderr": 0.02557625706125384, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.024870815251057075 }, "hendrycksTest-management": { "acc": 0.7184466019417476, "acc_stderr": 0.044532548363264673, "acc_norm": 0.5533980582524272, "acc_norm_stderr": 0.04922424153458933 }, "hendrycksTest-electrical_engineering": { "acc": 0.5172413793103449, "acc_stderr": 0.04164188720169375, "acc_norm": 0.38620689655172413, "acc_norm_stderr": 0.040573247344190336 }, "hendrycksTest-nutrition": { "acc": 0.6111111111111112, "acc_stderr": 0.02791405551046801, "acc_norm": 0.5032679738562091, "acc_norm_stderr": 0.028629305194003543 }, "hendrycksTest-computer_security": { "acc": 0.66, "acc_stderr": 0.04760952285695237, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 } }, "versions": { "hendrycksTest-high_school_world_history": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-international_law": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-high_school_physics": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-high_school_geography": 0, 
"hendrycksTest-global_facts": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-virology": 0, "hendrycksTest-marketing": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-sociology": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-clinical_knowledge": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-management": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-computer_security": 0 }, "config": { "model": "hf-causal-experimental", "model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True", "num_fewshot": 5, "batch_size": "auto", "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }