{ "results": { "hendrycksTest-abstract_algebra": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "hendrycksTest-anatomy": { "acc": 0.43703703703703706, "acc_stderr": 0.04284958639753399, "acc_norm": 0.43703703703703706, "acc_norm_stderr": 0.04284958639753399 }, "hendrycksTest-astronomy": { "acc": 0.40789473684210525, "acc_stderr": 0.03999309712777471, "acc_norm": 0.40789473684210525, "acc_norm_stderr": 0.03999309712777471 }, "hendrycksTest-business_ethics": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "hendrycksTest-clinical_knowledge": { "acc": 0.4830188679245283, "acc_stderr": 0.030755120364119905, "acc_norm": 0.4830188679245283, "acc_norm_stderr": 0.030755120364119905 }, "hendrycksTest-college_biology": { "acc": 0.4652777777777778, "acc_stderr": 0.04171115858181618, "acc_norm": 0.4652777777777778, "acc_norm_stderr": 0.04171115858181618 }, "hendrycksTest-college_chemistry": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-college_computer_science": { "acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "hendrycksTest-college_mathematics": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "hendrycksTest-college_medicine": { "acc": 0.36416184971098264, "acc_stderr": 0.03669072477416907, "acc_norm": 0.36416184971098264, "acc_norm_stderr": 0.03669072477416907 }, "hendrycksTest-college_physics": { "acc": 0.2549019607843137, "acc_stderr": 0.043364327079931785, "acc_norm": 0.2549019607843137, "acc_norm_stderr": 0.043364327079931785 }, "hendrycksTest-computer_security": { "acc": 0.55, "acc_stderr": 0.05, "acc_norm": 0.55, "acc_norm_stderr": 0.05 }, "hendrycksTest-conceptual_physics": { "acc": 0.4085106382978723, "acc_stderr": 0.03213418026701576, "acc_norm": 0.4085106382978723, "acc_norm_stderr": 0.03213418026701576 }, "hendrycksTest-econometrics": { "acc": 0.2894736842105263, "acc_stderr": 0.04266339443159394, "acc_norm": 0.2894736842105263, "acc_norm_stderr": 0.04266339443159394 }, "hendrycksTest-electrical_engineering": { "acc": 0.4206896551724138, "acc_stderr": 0.0411391498118926, "acc_norm": 0.4206896551724138, "acc_norm_stderr": 0.0411391498118926 }, "hendrycksTest-elementary_mathematics": { "acc": 0.291005291005291, "acc_stderr": 0.02339382650048487, "acc_norm": 0.291005291005291, "acc_norm_stderr": 0.02339382650048487 }, "hendrycksTest-formal_logic": { "acc": 0.23015873015873015, "acc_stderr": 0.037649508797906045, "acc_norm": 0.23015873015873015, "acc_norm_stderr": 0.037649508797906045 }, "hendrycksTest-global_facts": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "hendrycksTest-high_school_biology": { "acc": 0.4870967741935484, "acc_stderr": 0.028434533152681855, "acc_norm": 0.4870967741935484, "acc_norm_stderr": 0.028434533152681855 }, "hendrycksTest-high_school_chemistry": { "acc": 0.33497536945812806, "acc_stderr": 0.033208527423483104, "acc_norm": 0.33497536945812806, "acc_norm_stderr": 0.033208527423483104 }, "hendrycksTest-high_school_computer_science": { "acc": 0.44, "acc_stderr": 0.04988876515698589, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589 }, "hendrycksTest-high_school_european_history": { "acc": 0.6363636363636364, "acc_stderr": 0.03756335775187897, "acc_norm": 0.6363636363636364, "acc_norm_stderr": 0.03756335775187897 }, "hendrycksTest-high_school_geography": { "acc": 0.5202020202020202, "acc_stderr": 0.03559443565563918, "acc_norm": 0.5202020202020202, "acc_norm_stderr": 0.03559443565563918 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.6424870466321243, "acc_stderr": 0.034588160421810114, "acc_norm": 0.6424870466321243, "acc_norm_stderr": 0.034588160421810114 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.4025641025641026, "acc_stderr": 0.02486499515976775, "acc_norm": 0.4025641025641026, "acc_norm_stderr": 0.02486499515976775 }, "hendrycksTest-high_school_mathematics": { "acc": 0.25925925925925924, "acc_stderr": 0.026719240783712177, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.026719240783712177 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.4411764705882353, "acc_stderr": 0.0322529423239964, "acc_norm": 0.4411764705882353, "acc_norm_stderr": 0.0322529423239964 }, "hendrycksTest-high_school_physics": { "acc": 0.23841059602649006, "acc_stderr": 0.03479185572599661, "acc_norm": 0.23841059602649006, "acc_norm_stderr": 0.03479185572599661 }, "hendrycksTest-high_school_psychology": { "acc": 0.6165137614678899, "acc_stderr": 0.020847156641915984, "acc_norm": 0.6165137614678899, "acc_norm_stderr": 0.020847156641915984 }, "hendrycksTest-high_school_statistics": { "acc": 0.27314814814814814, "acc_stderr": 0.03038805130167812, "acc_norm": 0.27314814814814814, "acc_norm_stderr": 0.03038805130167812 }, "hendrycksTest-high_school_us_history": { "acc": 0.6470588235294118, "acc_stderr": 0.03354092437591518, "acc_norm": 0.6470588235294118, "acc_norm_stderr": 0.03354092437591518 }, "hendrycksTest-high_school_world_history": { "acc": 0.6455696202531646, "acc_stderr": 0.031137304297185805, "acc_norm": 0.6455696202531646, "acc_norm_stderr": 0.031137304297185805 }, "hendrycksTest-human_aging": { "acc": 0.547085201793722, "acc_stderr": 0.03340867501923324, "acc_norm": 0.547085201793722, "acc_norm_stderr": 0.03340867501923324 }, "hendrycksTest-human_sexuality": { "acc": 0.549618320610687, "acc_stderr": 0.04363643698524779, "acc_norm": 0.549618320610687, "acc_norm_stderr": 0.04363643698524779 }, "hendrycksTest-international_law": { "acc": 0.628099173553719, "acc_stderr": 0.04412015806624504, "acc_norm": 0.628099173553719, "acc_norm_stderr": 0.04412015806624504 }, "hendrycksTest-jurisprudence": { "acc": 0.49074074074074076, "acc_stderr": 0.04832853553437055, "acc_norm": 0.49074074074074076, "acc_norm_stderr": 0.04832853553437055 }, "hendrycksTest-logical_fallacies": { "acc": 0.49079754601226994, "acc_stderr": 0.03927705600787443, "acc_norm": 0.49079754601226994, "acc_norm_stderr": 0.03927705600787443 }, "hendrycksTest-machine_learning": { "acc": 0.42857142857142855, "acc_stderr": 0.04697113923010212, "acc_norm": 0.42857142857142855, "acc_norm_stderr": 0.04697113923010212 }, "hendrycksTest-management": { "acc": 0.5339805825242718, "acc_stderr": 0.0493929144727348, "acc_norm": 0.5339805825242718, "acc_norm_stderr": 0.0493929144727348 }, "hendrycksTest-marketing": { "acc": 0.7136752136752137, "acc_stderr": 0.02961432369045665, "acc_norm": 0.7136752136752137, "acc_norm_stderr": 0.02961432369045665 }, "hendrycksTest-medical_genetics": { "acc": 0.55, "acc_stderr": 0.04999999999999999, "acc_norm": 0.55, "acc_norm_stderr": 0.04999999999999999 }, "hendrycksTest-miscellaneous": { "acc": 0.6219667943805874, "acc_stderr": 0.017339844462104594, "acc_norm": 0.6219667943805874, "acc_norm_stderr": 0.017339844462104594 }, "hendrycksTest-moral_disputes": { "acc": 0.5317919075144508, "acc_stderr": 0.02686462436675664, "acc_norm": 0.5317919075144508, "acc_norm_stderr": 0.02686462436675664 }, "hendrycksTest-moral_scenarios": { "acc": 0.2424581005586592, "acc_stderr": 0.014333522059217892, "acc_norm": 0.2424581005586592, "acc_norm_stderr": 0.014333522059217892 }, "hendrycksTest-nutrition": { "acc": 0.477124183006536, "acc_stderr": 0.028599936776089782, "acc_norm": 0.477124183006536, "acc_norm_stderr": 0.028599936776089782 }, "hendrycksTest-philosophy": { "acc": 0.5530546623794212, "acc_stderr": 0.028237769422085328, "acc_norm": 0.5530546623794212, "acc_norm_stderr": 0.028237769422085328 }, "hendrycksTest-prehistory": { "acc": 0.5308641975308642, "acc_stderr": 0.027767689606833925, "acc_norm": 0.5308641975308642, "acc_norm_stderr": 0.027767689606833925 }, "hendrycksTest-professional_accounting": { "acc": 0.375886524822695, "acc_stderr": 0.028893955412115882, "acc_norm": 0.375886524822695, "acc_norm_stderr": 0.028893955412115882 }, "hendrycksTest-professional_law": { "acc": 0.35723598435462844, "acc_stderr": 0.012238615750316505, "acc_norm": 0.35723598435462844, "acc_norm_stderr": 0.012238615750316505 }, "hendrycksTest-professional_medicine": { "acc": 0.4227941176470588, "acc_stderr": 0.03000856284500349, "acc_norm": 0.4227941176470588, "acc_norm_stderr": 0.03000856284500349 }, "hendrycksTest-professional_psychology": { "acc": 0.4591503267973856, "acc_stderr": 0.020160213617222516, "acc_norm": 0.4591503267973856, "acc_norm_stderr": 0.020160213617222516 }, "hendrycksTest-public_relations": { "acc": 0.5454545454545454, "acc_stderr": 0.04769300568972744, "acc_norm": 0.5454545454545454, "acc_norm_stderr": 0.04769300568972744 }, "hendrycksTest-security_studies": { "acc": 0.49795918367346936, "acc_stderr": 0.0320089533497105, "acc_norm": 0.49795918367346936, "acc_norm_stderr": 0.0320089533497105 }, "hendrycksTest-sociology": { "acc": 0.6318407960199005, "acc_stderr": 0.03410410565495301, "acc_norm": 0.6318407960199005, "acc_norm_stderr": 0.03410410565495301 }, "hendrycksTest-us_foreign_policy": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, "hendrycksTest-virology": { "acc": 0.41566265060240964, "acc_stderr": 0.03836722176598053, "acc_norm": 0.41566265060240964, "acc_norm_stderr": 0.03836722176598053 }, "hendrycksTest-world_religions": { "acc": 0.6549707602339181, "acc_stderr": 0.036459813773888065, "acc_norm": 0.6549707602339181, "acc_norm_stderr": 0.036459813773888065 } }, "versions": { "hendrycksTest-abstract_algebra": 1, "hendrycksTest-anatomy": 1, "hendrycksTest-astronomy": 1, "hendrycksTest-business_ethics": 1, "hendrycksTest-clinical_knowledge": 1, "hendrycksTest-college_biology": 1, "hendrycksTest-college_chemistry": 1, "hendrycksTest-college_computer_science": 1, "hendrycksTest-college_mathematics": 1, "hendrycksTest-college_medicine": 1, "hendrycksTest-college_physics": 1, "hendrycksTest-computer_security": 1, "hendrycksTest-conceptual_physics": 1, "hendrycksTest-econometrics": 1, "hendrycksTest-electrical_engineering": 1, "hendrycksTest-elementary_mathematics": 1, "hendrycksTest-formal_logic": 1, "hendrycksTest-global_facts": 1, "hendrycksTest-high_school_biology": 1, "hendrycksTest-high_school_chemistry": 1, "hendrycksTest-high_school_computer_science": 1, "hendrycksTest-high_school_european_history": 1, "hendrycksTest-high_school_geography": 1, "hendrycksTest-high_school_government_and_politics": 1, "hendrycksTest-high_school_macroeconomics": 1, "hendrycksTest-high_school_mathematics": 1, "hendrycksTest-high_school_microeconomics": 1, "hendrycksTest-high_school_physics": 1, "hendrycksTest-high_school_psychology": 1, "hendrycksTest-high_school_statistics": 1, "hendrycksTest-high_school_us_history": 1, "hendrycksTest-high_school_world_history": 1, "hendrycksTest-human_aging": 1, "hendrycksTest-human_sexuality": 1, "hendrycksTest-international_law": 1, "hendrycksTest-jurisprudence": 1, "hendrycksTest-logical_fallacies": 1, "hendrycksTest-machine_learning": 1, "hendrycksTest-management": 1, "hendrycksTest-marketing": 1, "hendrycksTest-medical_genetics": 1, "hendrycksTest-miscellaneous": 1, "hendrycksTest-moral_disputes": 1, "hendrycksTest-moral_scenarios": 1, "hendrycksTest-nutrition": 1, "hendrycksTest-philosophy": 1, "hendrycksTest-prehistory": 1, "hendrycksTest-professional_accounting": 1, "hendrycksTest-professional_law": 1, "hendrycksTest-professional_medicine": 1, "hendrycksTest-professional_psychology": 1, "hendrycksTest-public_relations": 1, "hendrycksTest-security_studies": 1, "hendrycksTest-sociology": 1, "hendrycksTest-us_foreign_policy": 1, "hendrycksTest-virology": 1, "hendrycksTest-world_religions": 1 }, "config": { "model": "hf-causal-experimental", "model_args": { "pretrained": "/network/alexandre/research/llama2_7b_ultrachat/dense/dense_finetuning/dense_LR1e-4_E1/training", "trust_remote_code": true }, "num_fewshot": 5, "batch_size": "4", "batch_sizes": [], "device": "cuda:5", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }