{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.26, "acc_stderr": 0.044084400227680814, "acc_norm": 0.26, "acc_norm_stderr": 0.044084400227680814 }, "hendrycksTest-anatomy": { "acc": 0.23703703703703705, "acc_stderr": 0.03673731683969506, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.03673731683969506 }, "hendrycksTest-astronomy": { "acc": 0.25, "acc_stderr": 0.03523807393012047, "acc_norm": 0.25, "acc_norm_stderr": 0.03523807393012047 }, "hendrycksTest-business_ethics": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-clinical_knowledge": { "acc": 0.22641509433962265, "acc_stderr": 0.02575755989310675, "acc_norm": 0.22641509433962265, "acc_norm_stderr": 0.02575755989310675 }, "hendrycksTest-college_biology": { "acc": 0.2708333333333333, "acc_stderr": 0.037161774375660185, "acc_norm": 0.2708333333333333, "acc_norm_stderr": 0.037161774375660185 }, "hendrycksTest-college_chemistry": { "acc": 0.28, "acc_stderr": 0.045126085985421276, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "hendrycksTest-college_computer_science": { "acc": 0.42, "acc_stderr": 0.04960449637488583, "acc_norm": 0.42, "acc_norm_stderr": 0.04960449637488583 }, "hendrycksTest-college_mathematics": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-college_medicine": { "acc": 0.2543352601156069, "acc_stderr": 0.0332055644308557, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557 }, "hendrycksTest-college_physics": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237656, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237656 }, "hendrycksTest-computer_security": { "acc": 0.32, "acc_stderr": 0.04688261722621504, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621504 }, "hendrycksTest-conceptual_physics": { "acc": 0.2978723404255319, "acc_stderr": 0.029896145682095462, "acc_norm": 0.2978723404255319, "acc_norm_stderr": 0.029896145682095462 }, "hendrycksTest-econometrics": { "acc": 0.2631578947368421, "acc_stderr": 0.04142439719489361, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.04142439719489361 }, "hendrycksTest-electrical_engineering": { "acc": 0.2413793103448276, "acc_stderr": 0.03565998174135303, "acc_norm": 0.2413793103448276, "acc_norm_stderr": 0.03565998174135303 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2724867724867725, "acc_stderr": 0.022930973071633356, "acc_norm": 0.2724867724867725, "acc_norm_stderr": 0.022930973071633356 }, "hendrycksTest-formal_logic": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.30952380952380953, "acc_norm_stderr": 0.04134913018303316 }, "hendrycksTest-global_facts": { "acc": 0.28, "acc_stderr": 0.045126085985421276, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "hendrycksTest-high_school_biology": { "acc": 0.22258064516129034, "acc_stderr": 0.023664216671642528, "acc_norm": 0.22258064516129034, "acc_norm_stderr": 0.023664216671642528 }, "hendrycksTest-high_school_chemistry": { "acc": 0.18719211822660098, "acc_stderr": 0.027444924966882618, "acc_norm": 0.18719211822660098, "acc_norm_stderr": 0.027444924966882618 }, "hendrycksTest-high_school_computer_science": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "hendrycksTest-high_school_european_history": { "acc": 0.296969696969697, "acc_stderr": 0.035679697722680474, "acc_norm": 0.296969696969697, "acc_norm_stderr": 0.035679697722680474 }, "hendrycksTest-high_school_geography": { "acc": 0.19696969696969696, "acc_stderr": 0.028335609732463355, "acc_norm": 0.19696969696969696, "acc_norm_stderr": 0.028335609732463355 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.23316062176165803, "acc_stderr": 0.030516111371476008, "acc_norm": 0.23316062176165803, "acc_norm_stderr": 0.030516111371476008 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.21025641025641026, "acc_stderr": 0.020660597485026935, "acc_norm": 0.21025641025641026, "acc_norm_stderr": 0.020660597485026935 }, "hendrycksTest-high_school_mathematics": { "acc": 0.21851851851851853, "acc_stderr": 0.025195752251823786, "acc_norm": 0.21851851851851853, "acc_norm_stderr": 0.025195752251823786 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.27310924369747897, "acc_stderr": 0.02894200404099817, "acc_norm": 0.27310924369747897, "acc_norm_stderr": 0.02894200404099817 }, "hendrycksTest-high_school_physics": { "acc": 0.26490066225165565, "acc_stderr": 0.03603038545360384, "acc_norm": 0.26490066225165565, "acc_norm_stderr": 0.03603038545360384 }, "hendrycksTest-high_school_psychology": { "acc": 0.23119266055045873, "acc_stderr": 0.018075750241633156, "acc_norm": 0.23119266055045873, "acc_norm_stderr": 0.018075750241633156 }, "hendrycksTest-high_school_statistics": { "acc": 0.22685185185185186, "acc_stderr": 0.028561650102422256, "acc_norm": 0.22685185185185186, "acc_norm_stderr": 0.028561650102422256 }, "hendrycksTest-high_school_us_history": { "acc": 0.2696078431372549, "acc_stderr": 0.031145570659486782, "acc_norm": 0.2696078431372549, "acc_norm_stderr": 0.031145570659486782 }, "hendrycksTest-high_school_world_history": { "acc": 0.28270042194092826, "acc_stderr": 0.029312814153955917, "acc_norm": 0.28270042194092826, "acc_norm_stderr": 0.029312814153955917 }, "hendrycksTest-human_aging": { "acc": 0.32286995515695066, "acc_stderr": 0.03138147637575498, "acc_norm": 0.32286995515695066, "acc_norm_stderr": 0.03138147637575498 }, "hendrycksTest-human_sexuality": { "acc": 0.3435114503816794, "acc_stderr": 0.041649760719448786, "acc_norm": 0.3435114503816794, "acc_norm_stderr": 0.041649760719448786 }, "hendrycksTest-international_law": { "acc": 0.2727272727272727, "acc_stderr": 0.04065578140908705, "acc_norm": 0.2727272727272727, "acc_norm_stderr": 0.04065578140908705 }, "hendrycksTest-jurisprudence": { "acc": 0.26851851851851855, "acc_stderr": 0.04284467968052192, "acc_norm": 0.26851851851851855, "acc_norm_stderr": 0.04284467968052192 }, "hendrycksTest-logical_fallacies": { "acc": 0.2147239263803681, "acc_stderr": 0.03226219377286774, "acc_norm": 0.2147239263803681, "acc_norm_stderr": 0.03226219377286774 }, "hendrycksTest-machine_learning": { "acc": 0.25892857142857145, "acc_stderr": 0.04157751539865629, "acc_norm": 0.25892857142857145, "acc_norm_stderr": 0.04157751539865629 }, "hendrycksTest-management": { "acc": 0.18446601941747573, "acc_stderr": 0.03840423627288276, "acc_norm": 0.18446601941747573, "acc_norm_stderr": 0.03840423627288276 }, "hendrycksTest-marketing": { "acc": 0.3076923076923077, "acc_stderr": 0.030236389942173106, "acc_norm": 0.3076923076923077, "acc_norm_stderr": 0.030236389942173106 }, "hendrycksTest-medical_genetics": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "hendrycksTest-miscellaneous": { "acc": 0.26947637292464877, "acc_stderr": 0.015866243073215068, "acc_norm": 0.26947637292464877, "acc_norm_stderr": 0.015866243073215068 }, "hendrycksTest-moral_disputes": { "acc": 0.30346820809248554, "acc_stderr": 0.024752411960917212, "acc_norm": 0.30346820809248554, "acc_norm_stderr": 0.024752411960917212 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.23798882681564246, "acc_norm_stderr": 0.014242630070574915 }, "hendrycksTest-nutrition": { "acc": 0.2549019607843137, "acc_stderr": 0.024954184324879905, "acc_norm": 0.2549019607843137, "acc_norm_stderr": 0.024954184324879905 }, "hendrycksTest-philosophy": { "acc": 0.2990353697749196, "acc_stderr": 0.026003301117885142, "acc_norm": 0.2990353697749196, "acc_norm_stderr": 0.026003301117885142 }, "hendrycksTest-prehistory": { "acc": 0.3117283950617284, "acc_stderr": 0.02577311116963045, "acc_norm": 0.3117283950617284, "acc_norm_stderr": 0.02577311116963045 }, "hendrycksTest-professional_accounting": { "acc": 0.2624113475177305, "acc_stderr": 0.026244920349843007, "acc_norm": 0.2624113475177305, "acc_norm_stderr": 0.026244920349843007 }, "hendrycksTest-professional_law": { "acc": 0.2803129074315515, "acc_stderr": 0.01147155594495862, "acc_norm": 0.2803129074315515, "acc_norm_stderr": 0.01147155594495862 }, "hendrycksTest-professional_medicine": { "acc": 0.19852941176470587, "acc_stderr": 0.0242310133705411, "acc_norm": 0.19852941176470587, "acc_norm_stderr": 0.0242310133705411 }, "hendrycksTest-professional_psychology": { "acc": 0.3088235294117647, "acc_stderr": 0.01869085027359529, "acc_norm": 0.3088235294117647, "acc_norm_stderr": 0.01869085027359529 }, "hendrycksTest-public_relations": { "acc": 0.3181818181818182, "acc_stderr": 0.04461272175910508, "acc_norm": 0.3181818181818182, "acc_norm_stderr": 0.04461272175910508 }, "hendrycksTest-security_studies": { "acc": 0.21224489795918366, "acc_stderr": 0.026176967197866767, "acc_norm": 0.21224489795918366, "acc_norm_stderr": 0.026176967197866767 }, "hendrycksTest-sociology": { "acc": 0.2736318407960199, "acc_stderr": 0.031524391865554, "acc_norm": 0.2736318407960199, "acc_norm_stderr": 0.031524391865554 }, "hendrycksTest-us_foreign_policy": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "hendrycksTest-virology": { "acc": 0.30120481927710846, "acc_stderr": 0.035716092300534796, "acc_norm": 0.30120481927710846, "acc_norm_stderr": 0.035716092300534796 }, "hendrycksTest-world_religions": { "acc": 0.38596491228070173, "acc_stderr": 0.03733756969066164, "acc_norm": 0.38596491228070173, "acc_norm_stderr": 0.03733756969066164 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, "hendrycksTest-high_school_government_and_politics": 1, "hendrycksTest-high_school_macroeconomics": 1, "hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "hf-causal", "model_args": "pretrained=workdir_7b/ckpt_349", "num_fewshot": 5, "batch_size": "8", "batch_sizes": [], "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }