{ "results": { "harness|arc:challenge|25": { "acc": 0.6262798634812287, "acc_stderr": 0.014137708601759091, "acc_norm": 0.6732081911262798, "acc_norm_stderr": 0.013706665975587333 }, "harness|hellaswag|10": { "acc": 0.6760605457080263, "acc_stderr": 0.00467020812857923, "acc_norm": 0.8733320055765784, "acc_norm_stderr": 0.0033192094001351187 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6296296296296297, "acc_stderr": 0.04171654161354544, "acc_norm": 0.6296296296296297, "acc_norm_stderr": 0.04171654161354544 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.8092105263157895, "acc_stderr": 0.031975658210325, "acc_norm": 0.8092105263157895, "acc_norm_stderr": 0.031975658210325 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.72, "acc_stderr": 0.04512608598542127, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542127 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7169811320754716, "acc_stderr": 0.027724236492700918, "acc_norm": 0.7169811320754716, "acc_norm_stderr": 0.027724236492700918 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8472222222222222, "acc_stderr": 0.030085743248565666, "acc_norm": 0.8472222222222222, "acc_norm_stderr": 0.030085743248565666 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.51, "acc_stderr": 0.05024183937956912, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.6, "acc_stderr": 0.049236596391733084, "acc_norm": 0.6, "acc_norm_stderr": 0.049236596391733084 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.37, "acc_stderr": 0.048523658709391, "acc_norm": 0.37, "acc_norm_stderr": 0.048523658709391 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6416184971098265, "acc_stderr": 0.03656343653353159, "acc_norm": 0.6416184971098265, "acc_norm_stderr": 0.03656343653353159 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.37254901960784315, "acc_stderr": 0.04810840148082635, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.04810840148082635 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6638297872340425, "acc_stderr": 0.030881618520676942, "acc_norm": 0.6638297872340425, "acc_norm_stderr": 0.030881618520676942 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4473684210526316, "acc_stderr": 0.04677473004491199, "acc_norm": 0.4473684210526316, "acc_norm_stderr": 0.04677473004491199 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6551724137931034, "acc_stderr": 0.03960933549451207, "acc_norm": 0.6551724137931034, "acc_norm_stderr": 0.03960933549451207 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.43386243386243384, "acc_stderr": 0.025525034382474894, "acc_norm": 0.43386243386243384, "acc_norm_stderr": 0.025525034382474894 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.47619047619047616, "acc_stderr": 0.04467062628403273, "acc_norm": 0.47619047619047616, "acc_norm_stderr": 0.04467062628403273 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8193548387096774, "acc_stderr": 0.02188617856717253, "acc_norm": 
0.8193548387096774, "acc_norm_stderr": 0.02188617856717253 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5123152709359606, "acc_stderr": 0.035169204442208966, "acc_norm": 0.5123152709359606, "acc_norm_stderr": 0.035169204442208966 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.79, "acc_stderr": 0.040936018074033256, "acc_norm": 0.79, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8303030303030303, "acc_stderr": 0.029311188674983134, "acc_norm": 0.8303030303030303, "acc_norm_stderr": 0.029311188674983134 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8787878787878788, "acc_stderr": 0.023253157951942084, "acc_norm": 0.8787878787878788, "acc_norm_stderr": 0.023253157951942084 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9430051813471503, "acc_stderr": 0.016731085293607555, "acc_norm": 0.9430051813471503, "acc_norm_stderr": 0.016731085293607555 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7410256410256411, "acc_stderr": 0.02221110681006167, "acc_norm": 0.7410256410256411, "acc_norm_stderr": 0.02221110681006167 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.35555555555555557, "acc_stderr": 0.029185714949857403, "acc_norm": 0.35555555555555557, "acc_norm_stderr": 0.029185714949857403 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.7647058823529411, "acc_stderr": 0.02755361446786381, "acc_norm": 0.7647058823529411, "acc_norm_stderr": 0.02755361446786381 }, "harness|hendrycksTest-high_school_physics|5": { "acc": 0.4304635761589404, "acc_stderr": 0.04042809961395634, "acc_norm": 0.4304635761589404, "acc_norm_stderr": 0.04042809961395634 }, "harness|hendrycksTest-high_school_psychology|5": { "acc": 0.8733944954128441, "acc_stderr": 0.014257128686165169, "acc_norm": 0.8733944954128441, "acc_norm_stderr": 0.014257128686165169 }, "harness|hendrycksTest-high_school_statistics|5": { "acc": 0.6342592592592593, "acc_stderr": 0.032847388576472056, "acc_norm": 0.6342592592592593, "acc_norm_stderr": 0.032847388576472056 }, "harness|hendrycksTest-high_school_us_history|5": { "acc": 0.8970588235294118, "acc_stderr": 0.02132833757080437, "acc_norm": 0.8970588235294118, "acc_norm_stderr": 0.02132833757080437 }, "harness|hendrycksTest-high_school_world_history|5": { "acc": 0.8776371308016878, "acc_stderr": 0.021331741829746786, "acc_norm": 0.8776371308016878, "acc_norm_stderr": 0.021331741829746786 }, "harness|hendrycksTest-human_aging|5": { "acc": 0.8026905829596412, "acc_stderr": 0.02670985334496796, "acc_norm": 0.8026905829596412, "acc_norm_stderr": 0.02670985334496796 }, "harness|hendrycksTest-human_sexuality|5": { "acc": 0.8778625954198473, "acc_stderr": 0.028718776889342344, "acc_norm": 0.8778625954198473, "acc_norm_stderr": 0.028718776889342344 }, "harness|hendrycksTest-international_law|5": { "acc": 0.8760330578512396, "acc_stderr": 0.03008309871603521, "acc_norm": 0.8760330578512396, "acc_norm_stderr": 0.03008309871603521 }, "harness|hendrycksTest-jurisprudence|5": { "acc": 0.8333333333333334, "acc_stderr": 0.03602814176392645, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.03602814176392645 }, "harness|hendrycksTest-logical_fallacies|5": { "acc": 0.803680981595092, "acc_stderr": 0.031207970394709218, "acc_norm": 0.803680981595092, "acc_norm_stderr": 0.031207970394709218 }, "harness|hendrycksTest-machine_learning|5": { "acc": 0.5357142857142857, "acc_stderr": 0.04733667890053756, "acc_norm": 
0.5357142857142857, "acc_norm_stderr": 0.04733667890053756 }, "harness|hendrycksTest-management|5": { "acc": 0.8349514563106796, "acc_stderr": 0.03675668832233188, "acc_norm": 0.8349514563106796, "acc_norm_stderr": 0.03675668832233188 }, "harness|hendrycksTest-marketing|5": { "acc": 0.905982905982906, "acc_stderr": 0.01911989279892498, "acc_norm": 0.905982905982906, "acc_norm_stderr": 0.01911989279892498 }, "harness|hendrycksTest-medical_genetics|5": { "acc": 0.74, "acc_stderr": 0.04408440022768077, "acc_norm": 0.74, "acc_norm_stderr": 0.04408440022768077 }, "harness|hendrycksTest-miscellaneous|5": { "acc": 0.8620689655172413, "acc_stderr": 0.012331009307795656, "acc_norm": 0.8620689655172413, "acc_norm_stderr": 0.012331009307795656 }, "harness|hendrycksTest-moral_disputes|5": { "acc": 0.7774566473988439, "acc_stderr": 0.02239421566194282, "acc_norm": 0.7774566473988439, "acc_norm_stderr": 0.02239421566194282 }, "harness|hendrycksTest-moral_scenarios|5": { "acc": 0.4547486033519553, "acc_stderr": 0.016653875777524012, "acc_norm": 0.4547486033519553, "acc_norm_stderr": 0.016653875777524012 }, "harness|hendrycksTest-nutrition|5": { "acc": 0.7810457516339869, "acc_stderr": 0.02367908986180772, "acc_norm": 0.7810457516339869, "acc_norm_stderr": 0.02367908986180772 }, "harness|hendrycksTest-philosophy|5": { "acc": 0.7877813504823151, "acc_stderr": 0.023222756797435115, "acc_norm": 0.7877813504823151, "acc_norm_stderr": 0.023222756797435115 }, "harness|hendrycksTest-prehistory|5": { "acc": 0.8364197530864198, "acc_stderr": 0.020581466138257114, "acc_norm": 0.8364197530864198, "acc_norm_stderr": 0.020581466138257114 }, "harness|hendrycksTest-professional_accounting|5": { "acc": 0.5673758865248227, "acc_stderr": 0.02955545423677884, "acc_norm": 0.5673758865248227, "acc_norm_stderr": 0.02955545423677884 }, "harness|hendrycksTest-professional_law|5": { "acc": 0.5319426336375489, "acc_stderr": 0.012744149704869645, "acc_norm": 0.5319426336375489, "acc_norm_stderr": 0.012744149704869645 }, "harness|hendrycksTest-professional_medicine|5": { "acc": 0.75, "acc_stderr": 0.026303648393696036, "acc_norm": 0.75, "acc_norm_stderr": 0.026303648393696036 }, "harness|hendrycksTest-professional_psychology|5": { "acc": 0.7565359477124183, "acc_stderr": 0.01736247376214662, "acc_norm": 0.7565359477124183, "acc_norm_stderr": 0.01736247376214662 }, "harness|hendrycksTest-public_relations|5": { "acc": 0.6909090909090909, "acc_stderr": 0.044262946482000985, "acc_norm": 0.6909090909090909, "acc_norm_stderr": 0.044262946482000985 }, "harness|hendrycksTest-security_studies|5": { "acc": 0.7918367346938775, "acc_stderr": 0.0259911176728133, "acc_norm": 0.7918367346938775, "acc_norm_stderr": 0.0259911176728133 }, "harness|hendrycksTest-sociology|5": { "acc": 0.900497512437811, "acc_stderr": 0.021166216304659393, "acc_norm": 0.900497512437811, "acc_norm_stderr": 0.021166216304659393 }, "harness|hendrycksTest-us_foreign_policy|5": { "acc": 0.92, "acc_stderr": 0.0272659924344291, "acc_norm": 0.92, "acc_norm_stderr": 0.0272659924344291 }, "harness|hendrycksTest-virology|5": { "acc": 0.5301204819277109, "acc_stderr": 0.03885425420866767, "acc_norm": 0.5301204819277109, "acc_norm_stderr": 0.03885425420866767 }, "harness|hendrycksTest-world_religions|5": { "acc": 0.8538011695906432, "acc_stderr": 0.027097290118070806, "acc_norm": 0.8538011695906432, "acc_norm_stderr": 0.027097290118070806 }, "harness|truthfulqa:mc|0": { "mc1": 0.3108935128518972, "mc1_stderr": 0.016203316673559696, "mc2": 0.44923493721887353, "mc2_stderr": 
0.01390226410719232 }, "all": { "acc": 0.6967225637378714, "acc_stderr": 0.030867069907791145, "acc_norm": 0.7008615431872544, "acc_norm_stderr": 0.030836865817034945, "mc1": 0.3108935128518972, "mc1_stderr": 0.016203316673559696, "mc2": 0.44923493721887353, "mc2_stderr": 0.01390226410719232 } }, "versions": { "harness|arc:challenge|25": 0, "harness|hellaswag|10": 0, "harness|hendrycksTest-abstract_algebra|5": 1, "harness|hendrycksTest-anatomy|5": 1, "harness|hendrycksTest-astronomy|5": 1, "harness|hendrycksTest-business_ethics|5": 1, "harness|hendrycksTest-clinical_knowledge|5": 1, "harness|hendrycksTest-college_biology|5": 1, "harness|hendrycksTest-college_chemistry|5": 1, "harness|hendrycksTest-college_computer_science|5": 1, "harness|hendrycksTest-college_mathematics|5": 1, "harness|hendrycksTest-college_medicine|5": 1, "harness|hendrycksTest-college_physics|5": 1, "harness|hendrycksTest-computer_security|5": 1, "harness|hendrycksTest-conceptual_physics|5": 1, "harness|hendrycksTest-econometrics|5": 1, "harness|hendrycksTest-electrical_engineering|5": 1, "harness|hendrycksTest-elementary_mathematics|5": 1, "harness|hendrycksTest-formal_logic|5": 1, "harness|hendrycksTest-global_facts|5": 1, "harness|hendrycksTest-high_school_biology|5": 1, "harness|hendrycksTest-high_school_chemistry|5": 1, "harness|hendrycksTest-high_school_computer_science|5": 1, "harness|hendrycksTest-high_school_european_history|5": 1, "harness|hendrycksTest-high_school_geography|5": 1, "harness|hendrycksTest-high_school_government_and_politics|5": 1, "harness|hendrycksTest-high_school_macroeconomics|5": 1, "harness|hendrycksTest-high_school_mathematics|5": 1, "harness|hendrycksTest-high_school_microeconomics|5": 1, "harness|hendrycksTest-high_school_physics|5": 1, "harness|hendrycksTest-high_school_psychology|5": 1, "harness|hendrycksTest-high_school_statistics|5": 1, "harness|hendrycksTest-high_school_us_history|5": 1, "harness|hendrycksTest-high_school_world_history|5": 1, "harness|hendrycksTest-human_aging|5": 1, "harness|hendrycksTest-human_sexuality|5": 1, "harness|hendrycksTest-international_law|5": 1, "harness|hendrycksTest-jurisprudence|5": 1, "harness|hendrycksTest-logical_fallacies|5": 1, "harness|hendrycksTest-machine_learning|5": 1, "harness|hendrycksTest-management|5": 1, "harness|hendrycksTest-marketing|5": 1, "harness|hendrycksTest-medical_genetics|5": 1, "harness|hendrycksTest-miscellaneous|5": 1, "harness|hendrycksTest-moral_disputes|5": 1, "harness|hendrycksTest-moral_scenarios|5": 1, "harness|hendrycksTest-nutrition|5": 1, "harness|hendrycksTest-philosophy|5": 1, "harness|hendrycksTest-prehistory|5": 1, "harness|hendrycksTest-professional_accounting|5": 1, "harness|hendrycksTest-professional_law|5": 1, "harness|hendrycksTest-professional_medicine|5": 1, "harness|hendrycksTest-professional_psychology|5": 1, "harness|hendrycksTest-public_relations|5": 1, "harness|hendrycksTest-security_studies|5": 1, "harness|hendrycksTest-sociology|5": 1, "harness|hendrycksTest-us_foreign_policy|5": 1, "harness|hendrycksTest-virology|5": 1, "harness|hendrycksTest-world_religions|5": 1, "harness|truthfulqa:mc|0": 1, "all": 0 }, "config": { "model_name": "meta-llama/Llama-2-70b-hf", "model_sha": "ed7b07231238f836b99bf45701b9a0063576b194", "model_dtype": "torch.float16", "lighteval_sha": "d2e819bc028044e701a13b954d3326ceddb71b98", "num_few_shot_default": 0, "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null } }