{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-anatomy": { "acc": 0.4666666666666667, "acc_stderr": 0.043097329010363554, "acc_norm": 0.4666666666666667, "acc_norm_stderr": 0.043097329010363554 }, "hendrycksTest-astronomy": { "acc": 0.42105263157894735, "acc_stderr": 0.040179012759817494, "acc_norm": 0.42105263157894735, "acc_norm_stderr": 0.040179012759817494 }, "hendrycksTest-business_ethics": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "hendrycksTest-clinical_knowledge": { "acc": 0.39245283018867927, "acc_stderr": 0.030052580579557845, "acc_norm": 0.39245283018867927, "acc_norm_stderr": 0.030052580579557845 }, "hendrycksTest-college_biology": { "acc": 0.3888888888888889, "acc_stderr": 0.04076663253918567, "acc_norm": 0.3888888888888889, "acc_norm_stderr": 0.04076663253918567 }, "hendrycksTest-college_chemistry": { "acc": 0.34, "acc_stderr": 0.047609522856952344, "acc_norm": 0.34, "acc_norm_stderr": 0.047609522856952344 }, "hendrycksTest-college_computer_science": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "hendrycksTest-college_mathematics": { "acc": 0.3, "acc_stderr": 0.04605661864718381, "acc_norm": 0.3, "acc_norm_stderr": 0.04605661864718381 }, "hendrycksTest-college_medicine": { "acc": 0.34104046242774566, "acc_stderr": 0.036146654241808254, "acc_norm": 0.34104046242774566, "acc_norm_stderr": 0.036146654241808254 }, "hendrycksTest-college_physics": { "acc": 0.19607843137254902, "acc_stderr": 0.03950581861179964, "acc_norm": 0.19607843137254902, "acc_norm_stderr": 0.03950581861179964 }, "hendrycksTest-computer_security": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "hendrycksTest-conceptual_physics": { "acc": 0.3872340425531915, "acc_stderr": 0.03184389265339526, "acc_norm": 0.3872340425531915, "acc_norm_stderr": 0.03184389265339526 }, "hendrycksTest-econometrics": { "acc": 0.2719298245614035, "acc_stderr": 0.04185774424022056, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.04185774424022056 }, "hendrycksTest-electrical_engineering": { "acc": 0.4, "acc_stderr": 0.040824829046386284, "acc_norm": 0.4, "acc_norm_stderr": 0.040824829046386284 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2671957671957672, "acc_stderr": 0.022789673145776568, "acc_norm": 0.2671957671957672, "acc_norm_stderr": 0.022789673145776568 }, "hendrycksTest-formal_logic": { "acc": 0.30158730158730157, "acc_stderr": 0.04104947269903394, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.04104947269903394 }, "hendrycksTest-global_facts": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-high_school_biology": { "acc": 0.4290322580645161, "acc_stderr": 0.02815603653823321, "acc_norm": 0.4290322580645161, "acc_norm_stderr": 0.02815603653823321 }, "hendrycksTest-high_school_chemistry": { "acc": 0.28078817733990147, "acc_stderr": 0.03161856335358608, "acc_norm": 0.28078817733990147, "acc_norm_stderr": 0.03161856335358608 }, "hendrycksTest-high_school_computer_science": { "acc": 0.4, "acc_stderr": 0.049236596391733084, "acc_norm": 0.4, "acc_norm_stderr": 0.049236596391733084 }, "hendrycksTest-high_school_european_history": { "acc": 0.503030303030303, "acc_stderr": 0.03904272341431857, "acc_norm": 
0.503030303030303, "acc_norm_stderr": 0.03904272341431857 }, "hendrycksTest-high_school_geography": { "acc": 0.43434343434343436, "acc_stderr": 0.03531505879359183, "acc_norm": 0.43434343434343436, "acc_norm_stderr": 0.03531505879359183 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.5958549222797928, "acc_stderr": 0.0354150857888402, "acc_norm": 0.5958549222797928, "acc_norm_stderr": 0.0354150857888402 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.3769230769230769, "acc_stderr": 0.024570975364225995, "acc_norm": 0.3769230769230769, "acc_norm_stderr": 0.024570975364225995 }, "hendrycksTest-high_school_mathematics": { "acc": 0.2740740740740741, "acc_stderr": 0.027195934804085622, "acc_norm": 0.2740740740740741, "acc_norm_stderr": 0.027195934804085622 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.31932773109243695, "acc_stderr": 0.0302839955258844, "acc_norm": 0.31932773109243695, "acc_norm_stderr": 0.0302839955258844 }, "hendrycksTest-high_school_physics": { "acc": 0.2781456953642384, "acc_stderr": 0.03658603262763743, "acc_norm": 0.2781456953642384, "acc_norm_stderr": 0.03658603262763743 }, "hendrycksTest-high_school_psychology": { "acc": 0.5247706422018349, "acc_stderr": 0.021410999753635914, "acc_norm": 0.5247706422018349, "acc_norm_stderr": 0.021410999753635914 }, "hendrycksTest-high_school_statistics": { "acc": 0.3194444444444444, "acc_stderr": 0.031798763421768524, "acc_norm": 0.3194444444444444, "acc_norm_stderr": 0.031798763421768524 }, "hendrycksTest-high_school_us_history": { "acc": 0.44607843137254904, "acc_stderr": 0.034888454513049734, "acc_norm": 0.44607843137254904, "acc_norm_stderr": 0.034888454513049734 }, "hendrycksTest-high_school_world_history": { "acc": 0.5611814345991561, "acc_stderr": 0.032302649315470375, "acc_norm": 0.5611814345991561, "acc_norm_stderr": 0.032302649315470375 }, "hendrycksTest-human_aging": { "acc": 0.43946188340807174, "acc_stderr": 0.03331092511038179, "acc_norm": 0.43946188340807174, "acc_norm_stderr": 0.03331092511038179 }, "hendrycksTest-human_sexuality": { "acc": 0.44274809160305345, "acc_stderr": 0.04356447202665069, "acc_norm": 0.44274809160305345, "acc_norm_stderr": 0.04356447202665069 }, "hendrycksTest-international_law": { "acc": 0.6115702479338843, "acc_stderr": 0.04449270350068382, "acc_norm": 0.6115702479338843, "acc_norm_stderr": 0.04449270350068382 }, "hendrycksTest-jurisprudence": { "acc": 0.4166666666666667, "acc_stderr": 0.04766075165356461, "acc_norm": 0.4166666666666667, "acc_norm_stderr": 0.04766075165356461 }, "hendrycksTest-logical_fallacies": { "acc": 0.4539877300613497, "acc_stderr": 0.0391170190467718, "acc_norm": 0.4539877300613497, "acc_norm_stderr": 0.0391170190467718 }, "hendrycksTest-machine_learning": { "acc": 0.32142857142857145, "acc_stderr": 0.044328040552915185, "acc_norm": 0.32142857142857145, "acc_norm_stderr": 0.044328040552915185 }, "hendrycksTest-management": { "acc": 0.36893203883495146, "acc_stderr": 0.04777615181156739, "acc_norm": 0.36893203883495146, "acc_norm_stderr": 0.04777615181156739 }, "hendrycksTest-marketing": { "acc": 0.5769230769230769, "acc_stderr": 0.032366121762202014, "acc_norm": 0.5769230769230769, "acc_norm_stderr": 0.032366121762202014 }, "hendrycksTest-medical_genetics": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "hendrycksTest-miscellaneous": { "acc": 0.5823754789272031, "acc_stderr": 0.01763563732695151, "acc_norm": 0.5823754789272031, "acc_norm_stderr": 0.01763563732695151 }, 
"hendrycksTest-moral_disputes": { "acc": 0.4653179190751445, "acc_stderr": 0.02685425792825889, "acc_norm": 0.4653179190751445, "acc_norm_stderr": 0.02685425792825889 }, "hendrycksTest-moral_scenarios": { "acc": 0.26145251396648045, "acc_stderr": 0.014696599650364557, "acc_norm": 0.26145251396648045, "acc_norm_stderr": 0.014696599650364557 }, "hendrycksTest-nutrition": { "acc": 0.4215686274509804, "acc_stderr": 0.028275490156791434, "acc_norm": 0.4215686274509804, "acc_norm_stderr": 0.028275490156791434 }, "hendrycksTest-philosophy": { "acc": 0.5176848874598071, "acc_stderr": 0.02838032284907713, "acc_norm": 0.5176848874598071, "acc_norm_stderr": 0.02838032284907713 }, "hendrycksTest-prehistory": { "acc": 0.4783950617283951, "acc_stderr": 0.027794760105008746, "acc_norm": 0.4783950617283951, "acc_norm_stderr": 0.027794760105008746 }, "hendrycksTest-professional_accounting": { "acc": 0.32269503546099293, "acc_stderr": 0.02788913930053479, "acc_norm": 0.32269503546099293, "acc_norm_stderr": 0.02788913930053479 }, "hendrycksTest-professional_law": { "acc": 0.3344198174706649, "acc_stderr": 0.012049668983214934, "acc_norm": 0.3344198174706649, "acc_norm_stderr": 0.012049668983214934 }, "hendrycksTest-professional_medicine": { "acc": 0.4522058823529412, "acc_stderr": 0.03023375855159644, "acc_norm": 0.4522058823529412, "acc_norm_stderr": 0.03023375855159644 }, "hendrycksTest-professional_psychology": { "acc": 0.41013071895424835, "acc_stderr": 0.019898412717635917, "acc_norm": 0.41013071895424835, "acc_norm_stderr": 0.019898412717635917 }, "hendrycksTest-public_relations": { "acc": 0.4636363636363636, "acc_stderr": 0.047764491623961985, "acc_norm": 0.4636363636363636, "acc_norm_stderr": 0.047764491623961985 }, "hendrycksTest-security_studies": { "acc": 0.42857142857142855, "acc_stderr": 0.03168091161233882, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.03168091161233882 }, "hendrycksTest-sociology": { "acc": 0.5024875621890548, "acc_stderr": 0.03535490150137289, "acc_norm": 0.5024875621890548, "acc_norm_stderr": 0.03535490150137289 }, "hendrycksTest-us_foreign_policy": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, "hendrycksTest-virology": { "acc": 0.40963855421686746, "acc_stderr": 0.03828401115079023, "acc_norm": 0.40963855421686746, "acc_norm_stderr": 0.03828401115079023 }, "hendrycksTest-world_religions": { "acc": 0.6198830409356725, "acc_stderr": 0.037229657413855394, "acc_norm": 0.6198830409356725, "acc_norm_stderr": 0.037229657413855394 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, "hendrycksTest-high_school_government_and_politics": 1, 
"hendrycksTest-high_school_macroeconomics": 1, "hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "sparseml", "model_args": "pretrained=/network/alexandre/research/cerebras/llama2_7B_sparse50_45B_retrained/checkpoint,dtype=bfloat16", "num_fewshot": 5, "batch_size": "4", "batch_sizes": [], "device": "cuda:7", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }