{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.26,
      "acc_stderr": 0.044084400227680814,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.044084400227680814
    },
    "hendrycksTest-anatomy": {
      "acc": 0.23703703703703705,
      "acc_stderr": 0.03673731683969506,
      "acc_norm": 0.23703703703703705,
      "acc_norm_stderr": 0.03673731683969506
    },
    "hendrycksTest-astronomy": {
      "acc": 0.25,
      "acc_stderr": 0.03523807393012047,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.03523807393012047
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.31,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.22641509433962265,
      "acc_stderr": 0.02575755989310675,
      "acc_norm": 0.22641509433962265,
      "acc_norm_stderr": 0.02575755989310675
    },
    "hendrycksTest-college_biology": {
      "acc": 0.2708333333333333,
      "acc_stderr": 0.037161774375660185,
      "acc_norm": 0.2708333333333333,
      "acc_norm_stderr": 0.037161774375660185
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.28,
      "acc_stderr": 0.045126085985421276,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.045126085985421276
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.42,
      "acc_stderr": 0.04960449637488583,
      "acc_norm": 0.42,
      "acc_norm_stderr": 0.04960449637488583
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.31,
      "acc_stderr": 0.04648231987117316,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.2543352601156069,
      "acc_stderr": 0.0332055644308557,
      "acc_norm": 0.2543352601156069,
      "acc_norm_stderr": 0.0332055644308557
    },
    "hendrycksTest-college_physics": {
      "acc": 0.21568627450980393,
      "acc_stderr": 0.04092563958237656,
      "acc_norm": 0.21568627450980393,
      "acc_norm_stderr": 0.04092563958237656
    },
    "hendrycksTest-computer_security": {
      "acc": 0.32,
      "acc_stderr": 0.04688261722621504,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.04688261722621504
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.2978723404255319,
      "acc_stderr": 0.029896145682095462,
      "acc_norm": 0.2978723404255319,
      "acc_norm_stderr": 0.029896145682095462
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2631578947368421,
      "acc_stderr": 0.04142439719489361,
      "acc_norm": 0.2631578947368421,
      "acc_norm_stderr": 0.04142439719489361
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.2413793103448276,
      "acc_stderr": 0.03565998174135303,
      "acc_norm": 0.2413793103448276,
      "acc_norm_stderr": 0.03565998174135303
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.2724867724867725,
      "acc_stderr": 0.022930973071633356,
      "acc_norm": 0.2724867724867725,
      "acc_norm_stderr": 0.022930973071633356
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.30952380952380953,
      "acc_stderr": 0.04134913018303316,
      "acc_norm": 0.30952380952380953,
      "acc_norm_stderr": 0.04134913018303316
    },
    "hendrycksTest-global_facts": {
      "acc": 0.28,
      "acc_stderr": 0.045126085985421276,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.045126085985421276
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.22258064516129034,
      "acc_stderr": 0.023664216671642528,
      "acc_norm": 0.22258064516129034,
      "acc_norm_stderr": 0.023664216671642528
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.18719211822660098,
      "acc_stderr": 0.027444924966882618,
      "acc_norm": 0.18719211822660098,
      "acc_norm_stderr": 0.027444924966882618
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.25,
      "acc_stderr": 0.04351941398892446,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.04351941398892446
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.296969696969697,
      "acc_stderr": 0.035679697722680474,
      "acc_norm": 0.296969696969697,
      "acc_norm_stderr": 0.035679697722680474
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.19696969696969696,
      "acc_stderr": 0.028335609732463355,
      "acc_norm": 0.19696969696969696,
      "acc_norm_stderr": 0.028335609732463355
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.23316062176165803,
      "acc_stderr": 0.030516111371476008,
      "acc_norm": 0.23316062176165803,
      "acc_norm_stderr": 0.030516111371476008
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.21025641025641026,
      "acc_stderr": 0.020660597485026935,
      "acc_norm": 0.21025641025641026,
      "acc_norm_stderr": 0.020660597485026935
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.21851851851851853,
      "acc_stderr": 0.025195752251823786,
      "acc_norm": 0.21851851851851853,
      "acc_norm_stderr": 0.025195752251823786
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.27310924369747897,
      "acc_stderr": 0.02894200404099817,
      "acc_norm": 0.27310924369747897,
      "acc_norm_stderr": 0.02894200404099817
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.26490066225165565,
      "acc_stderr": 0.03603038545360384,
      "acc_norm": 0.26490066225165565,
      "acc_norm_stderr": 0.03603038545360384
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.23119266055045873,
      "acc_stderr": 0.018075750241633156,
      "acc_norm": 0.23119266055045873,
      "acc_norm_stderr": 0.018075750241633156
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.22685185185185186,
      "acc_stderr": 0.028561650102422256,
      "acc_norm": 0.22685185185185186,
      "acc_norm_stderr": 0.028561650102422256
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.2696078431372549,
      "acc_stderr": 0.031145570659486782,
      "acc_norm": 0.2696078431372549,
      "acc_norm_stderr": 0.031145570659486782
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.28270042194092826,
      "acc_stderr": 0.029312814153955917,
      "acc_norm": 0.28270042194092826,
      "acc_norm_stderr": 0.029312814153955917
    },
    "hendrycksTest-human_aging": {
      "acc": 0.32286995515695066,
      "acc_stderr": 0.03138147637575498,
      "acc_norm": 0.32286995515695066,
      "acc_norm_stderr": 0.03138147637575498
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.3435114503816794,
      "acc_stderr": 0.041649760719448786,
      "acc_norm": 0.3435114503816794,
      "acc_norm_stderr": 0.041649760719448786
    },
    "hendrycksTest-international_law": {
      "acc": 0.2727272727272727,
      "acc_stderr": 0.04065578140908705,
      "acc_norm": 0.2727272727272727,
      "acc_norm_stderr": 0.04065578140908705
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.26851851851851855,
      "acc_stderr": 0.04284467968052192,
      "acc_norm": 0.26851851851851855,
      "acc_norm_stderr": 0.04284467968052192
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.2147239263803681,
      "acc_stderr": 0.03226219377286774,
      "acc_norm": 0.2147239263803681,
      "acc_norm_stderr": 0.03226219377286774
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.25892857142857145,
      "acc_stderr": 0.04157751539865629,
      "acc_norm": 0.25892857142857145,
      "acc_norm_stderr": 0.04157751539865629
    },
    "hendrycksTest-management": {
      "acc": 0.18446601941747573,
      "acc_stderr": 0.03840423627288276,
      "acc_norm": 0.18446601941747573,
      "acc_norm_stderr": 0.03840423627288276
    },
    "hendrycksTest-marketing": {
      "acc": 0.3076923076923077,
      "acc_stderr": 0.030236389942173106,
      "acc_norm": 0.3076923076923077,
      "acc_norm_stderr": 0.030236389942173106
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.44,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.44,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.26947637292464877,
      "acc_stderr": 0.015866243073215068,
      "acc_norm": 0.26947637292464877,
      "acc_norm_stderr": 0.015866243073215068
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.30346820809248554,
      "acc_stderr": 0.024752411960917212,
      "acc_norm": 0.30346820809248554,
      "acc_norm_stderr": 0.024752411960917212
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.23798882681564246,
      "acc_stderr": 0.014242630070574915,
      "acc_norm": 0.23798882681564246,
      "acc_norm_stderr": 0.014242630070574915
    },
    "hendrycksTest-nutrition": {
      "acc": 0.2549019607843137,
      "acc_stderr": 0.024954184324879905,
      "acc_norm": 0.2549019607843137,
      "acc_norm_stderr": 0.024954184324879905
    },
    "hendrycksTest-philosophy": {
      "acc": 0.2990353697749196,
      "acc_stderr": 0.026003301117885142,
      "acc_norm": 0.2990353697749196,
      "acc_norm_stderr": 0.026003301117885142
    },
    "hendrycksTest-prehistory": {
      "acc": 0.3117283950617284,
      "acc_stderr": 0.02577311116963045,
      "acc_norm": 0.3117283950617284,
      "acc_norm_stderr": 0.02577311116963045
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.2624113475177305,
      "acc_stderr": 0.026244920349843007,
      "acc_norm": 0.2624113475177305,
      "acc_norm_stderr": 0.026244920349843007
    },
    "hendrycksTest-professional_law": {
      "acc": 0.2803129074315515,
      "acc_stderr": 0.01147155594495862,
      "acc_norm": 0.2803129074315515,
      "acc_norm_stderr": 0.01147155594495862
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.19852941176470587,
      "acc_stderr": 0.0242310133705411,
      "acc_norm": 0.19852941176470587,
      "acc_norm_stderr": 0.0242310133705411
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.3088235294117647,
      "acc_stderr": 0.01869085027359529,
      "acc_norm": 0.3088235294117647,
      "acc_norm_stderr": 0.01869085027359529
    },
    "hendrycksTest-public_relations": {
      "acc": 0.3181818181818182,
      "acc_stderr": 0.04461272175910508,
      "acc_norm": 0.3181818181818182,
      "acc_norm_stderr": 0.04461272175910508
    },
    "hendrycksTest-security_studies": {
      "acc": 0.21224489795918366,
      "acc_stderr": 0.026176967197866767,
      "acc_norm": 0.21224489795918366,
      "acc_norm_stderr": 0.026176967197866767
    },
    "hendrycksTest-sociology": {
      "acc": 0.2736318407960199,
      "acc_stderr": 0.031524391865554,
      "acc_norm": 0.2736318407960199,
      "acc_norm_stderr": 0.031524391865554
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.28,
      "acc_stderr": 0.04512608598542128,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.04512608598542128
    },
    "hendrycksTest-virology": {
      "acc": 0.30120481927710846,
      "acc_stderr": 0.035716092300534796,
      "acc_norm": 0.30120481927710846,
      "acc_norm_stderr": 0.035716092300534796
    },
    "hendrycksTest-world_religions": {
      "acc": 0.38596491228070173,
      "acc_stderr": 0.03733756969066164,
      "acc_norm": 0.38596491228070173,
      "acc_norm_stderr": 0.03733756969066164
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "hf-causal",
    "model_args": "pretrained=workdir_7b/ckpt_349",
    "num_fewshot": 5,
    "batch_size": "8",
    "batch_sizes": [],
    "device": null,
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
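
This is the raw per-subtask output of the EleutherAI lm-evaluation-harness; the `config` block records the run (five-shot, `hf-causal` model loaded from `workdir_7b/ckpt_349`, batch size 8). The harness does not roll the 57 `hendrycksTest-*` subtasks up into a single MMLU score, so a minimal sketch of the usual unweighted macro-average follows, assuming the output above is saved as `results.json` (a hypothetical filename):

```python
import json

# Load the harness output; "results.json" is an assumed filename.
with open("results.json") as f:
    report = json.load(f)

# Unweighted macro-average over the hendrycksTest-* subtasks:
# each subtask counts equally, regardless of its question count.
accs = [task["acc"] for task in report["results"].values()]
print(f"MMLU 5-shot macro-avg acc over {len(accs)} subtasks: {sum(accs) / len(accs):.4f}")
```

Note that `acc` and `acc_norm` are identical for every subtask here, which is expected: MMLU answer choices are single letters of equal length, so length normalization does not change the argmax.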