{
"results": {
"hendrycksTest-abstract_algebra": {
"acc": 0.26,
"acc_stderr": 0.044084400227680814,
"acc_norm": 0.26,
"acc_norm_stderr": 0.044084400227680814
},
"hendrycksTest-anatomy": {
"acc": 0.23703703703703705,
"acc_stderr": 0.03673731683969506,
"acc_norm": 0.23703703703703705,
"acc_norm_stderr": 0.03673731683969506
},
"hendrycksTest-astronomy": {
"acc": 0.25,
"acc_stderr": 0.03523807393012047,
"acc_norm": 0.25,
"acc_norm_stderr": 0.03523807393012047
},
"hendrycksTest-business_ethics": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.22641509433962265,
"acc_stderr": 0.02575755989310675,
"acc_norm": 0.22641509433962265,
"acc_norm_stderr": 0.02575755989310675
},
"hendrycksTest-college_biology": {
"acc": 0.2708333333333333,
"acc_stderr": 0.037161774375660185,
"acc_norm": 0.2708333333333333,
"acc_norm_stderr": 0.037161774375660185
},
"hendrycksTest-college_chemistry": {
"acc": 0.28,
"acc_stderr": 0.045126085985421276,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421276
},
"hendrycksTest-college_computer_science": {
"acc": 0.42,
"acc_stderr": 0.04960449637488583,
"acc_norm": 0.42,
"acc_norm_stderr": 0.04960449637488583
},
"hendrycksTest-college_mathematics": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-college_medicine": {
"acc": 0.2543352601156069,
"acc_stderr": 0.0332055644308557,
"acc_norm": 0.2543352601156069,
"acc_norm_stderr": 0.0332055644308557
},
"hendrycksTest-college_physics": {
"acc": 0.21568627450980393,
"acc_stderr": 0.04092563958237656,
"acc_norm": 0.21568627450980393,
"acc_norm_stderr": 0.04092563958237656
},
"hendrycksTest-computer_security": {
"acc": 0.32,
"acc_stderr": 0.04688261722621504,
"acc_norm": 0.32,
"acc_norm_stderr": 0.04688261722621504
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2978723404255319,
"acc_stderr": 0.029896145682095462,
"acc_norm": 0.2978723404255319,
"acc_norm_stderr": 0.029896145682095462
},
"hendrycksTest-econometrics": {
"acc": 0.2631578947368421,
"acc_stderr": 0.04142439719489361,
"acc_norm": 0.2631578947368421,
"acc_norm_stderr": 0.04142439719489361
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2413793103448276,
"acc_stderr": 0.03565998174135303,
"acc_norm": 0.2413793103448276,
"acc_norm_stderr": 0.03565998174135303
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2724867724867725,
"acc_stderr": 0.022930973071633356,
"acc_norm": 0.2724867724867725,
"acc_norm_stderr": 0.022930973071633356
},
"hendrycksTest-formal_logic": {
"acc": 0.30952380952380953,
"acc_stderr": 0.04134913018303316,
"acc_norm": 0.30952380952380953,
"acc_norm_stderr": 0.04134913018303316
},
"hendrycksTest-global_facts": {
"acc": 0.28,
"acc_stderr": 0.045126085985421276,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421276
},
"hendrycksTest-high_school_biology": {
"acc": 0.22258064516129034,
"acc_stderr": 0.023664216671642528,
"acc_norm": 0.22258064516129034,
"acc_norm_stderr": 0.023664216671642528
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.18719211822660098,
"acc_stderr": 0.027444924966882618,
"acc_norm": 0.18719211822660098,
"acc_norm_stderr": 0.027444924966882618
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"hendrycksTest-high_school_european_history": {
"acc": 0.296969696969697,
"acc_stderr": 0.035679697722680474,
"acc_norm": 0.296969696969697,
"acc_norm_stderr": 0.035679697722680474
},
"hendrycksTest-high_school_geography": {
"acc": 0.19696969696969696,
"acc_stderr": 0.028335609732463355,
"acc_norm": 0.19696969696969696,
"acc_norm_stderr": 0.028335609732463355
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.23316062176165803,
"acc_stderr": 0.030516111371476008,
"acc_norm": 0.23316062176165803,
"acc_norm_stderr": 0.030516111371476008
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.21025641025641026,
"acc_stderr": 0.020660597485026935,
"acc_norm": 0.21025641025641026,
"acc_norm_stderr": 0.020660597485026935
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.21851851851851853,
"acc_stderr": 0.025195752251823786,
"acc_norm": 0.21851851851851853,
"acc_norm_stderr": 0.025195752251823786
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.27310924369747897,
"acc_stderr": 0.02894200404099817,
"acc_norm": 0.27310924369747897,
"acc_norm_stderr": 0.02894200404099817
},
"hendrycksTest-high_school_physics": {
"acc": 0.26490066225165565,
"acc_stderr": 0.03603038545360384,
"acc_norm": 0.26490066225165565,
"acc_norm_stderr": 0.03603038545360384
},
"hendrycksTest-high_school_psychology": {
"acc": 0.23119266055045873,
"acc_stderr": 0.018075750241633156,
"acc_norm": 0.23119266055045873,
"acc_norm_stderr": 0.018075750241633156
},
"hendrycksTest-high_school_statistics": {
"acc": 0.22685185185185186,
"acc_stderr": 0.028561650102422256,
"acc_norm": 0.22685185185185186,
"acc_norm_stderr": 0.028561650102422256
},
"hendrycksTest-high_school_us_history": {
"acc": 0.2696078431372549,
"acc_stderr": 0.031145570659486782,
"acc_norm": 0.2696078431372549,
"acc_norm_stderr": 0.031145570659486782
},
"hendrycksTest-high_school_world_history": {
"acc": 0.28270042194092826,
"acc_stderr": 0.029312814153955917,
"acc_norm": 0.28270042194092826,
"acc_norm_stderr": 0.029312814153955917
},
"hendrycksTest-human_aging": {
"acc": 0.32286995515695066,
"acc_stderr": 0.03138147637575498,
"acc_norm": 0.32286995515695066,
"acc_norm_stderr": 0.03138147637575498
},
"hendrycksTest-human_sexuality": {
"acc": 0.3435114503816794,
"acc_stderr": 0.041649760719448786,
"acc_norm": 0.3435114503816794,
"acc_norm_stderr": 0.041649760719448786
},
"hendrycksTest-international_law": {
"acc": 0.2727272727272727,
"acc_stderr": 0.04065578140908705,
"acc_norm": 0.2727272727272727,
"acc_norm_stderr": 0.04065578140908705
},
"hendrycksTest-jurisprudence": {
"acc": 0.26851851851851855,
"acc_stderr": 0.04284467968052192,
"acc_norm": 0.26851851851851855,
"acc_norm_stderr": 0.04284467968052192
},
"hendrycksTest-logical_fallacies": {
"acc": 0.2147239263803681,
"acc_stderr": 0.03226219377286774,
"acc_norm": 0.2147239263803681,
"acc_norm_stderr": 0.03226219377286774
},
"hendrycksTest-machine_learning": {
"acc": 0.25892857142857145,
"acc_stderr": 0.04157751539865629,
"acc_norm": 0.25892857142857145,
"acc_norm_stderr": 0.04157751539865629
},
"hendrycksTest-management": {
"acc": 0.18446601941747573,
"acc_stderr": 0.03840423627288276,
"acc_norm": 0.18446601941747573,
"acc_norm_stderr": 0.03840423627288276
},
"hendrycksTest-marketing": {
"acc": 0.3076923076923077,
"acc_stderr": 0.030236389942173106,
"acc_norm": 0.3076923076923077,
"acc_norm_stderr": 0.030236389942173106
},
"hendrycksTest-medical_genetics": {
"acc": 0.44,
"acc_stderr": 0.04988876515698589,
"acc_norm": 0.44,
"acc_norm_stderr": 0.04988876515698589
},
"hendrycksTest-miscellaneous": {
"acc": 0.26947637292464877,
"acc_stderr": 0.015866243073215068,
"acc_norm": 0.26947637292464877,
"acc_norm_stderr": 0.015866243073215068
},
"hendrycksTest-moral_disputes": {
"acc": 0.30346820809248554,
"acc_stderr": 0.024752411960917212,
"acc_norm": 0.30346820809248554,
"acc_norm_stderr": 0.024752411960917212
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.23798882681564246,
"acc_norm_stderr": 0.014242630070574915
},
"hendrycksTest-nutrition": {
"acc": 0.2549019607843137,
"acc_stderr": 0.024954184324879905,
"acc_norm": 0.2549019607843137,
"acc_norm_stderr": 0.024954184324879905
},
"hendrycksTest-philosophy": {
"acc": 0.2990353697749196,
"acc_stderr": 0.026003301117885142,
"acc_norm": 0.2990353697749196,
"acc_norm_stderr": 0.026003301117885142
},
"hendrycksTest-prehistory": {
"acc": 0.3117283950617284,
"acc_stderr": 0.02577311116963045,
"acc_norm": 0.3117283950617284,
"acc_norm_stderr": 0.02577311116963045
},
"hendrycksTest-professional_accounting": {
"acc": 0.2624113475177305,
"acc_stderr": 0.026244920349843007,
"acc_norm": 0.2624113475177305,
"acc_norm_stderr": 0.026244920349843007
},
"hendrycksTest-professional_law": {
"acc": 0.2803129074315515,
"acc_stderr": 0.01147155594495862,
"acc_norm": 0.2803129074315515,
"acc_norm_stderr": 0.01147155594495862
},
"hendrycksTest-professional_medicine": {
"acc": 0.19852941176470587,
"acc_stderr": 0.0242310133705411,
"acc_norm": 0.19852941176470587,
"acc_norm_stderr": 0.0242310133705411
},
"hendrycksTest-professional_psychology": {
"acc": 0.3088235294117647,
"acc_stderr": 0.01869085027359529,
"acc_norm": 0.3088235294117647,
"acc_norm_stderr": 0.01869085027359529
},
"hendrycksTest-public_relations": {
"acc": 0.3181818181818182,
"acc_stderr": 0.04461272175910508,
"acc_norm": 0.3181818181818182,
"acc_norm_stderr": 0.04461272175910508
},
"hendrycksTest-security_studies": {
"acc": 0.21224489795918366,
"acc_stderr": 0.026176967197866767,
"acc_norm": 0.21224489795918366,
"acc_norm_stderr": 0.026176967197866767
},
"hendrycksTest-sociology": {
"acc": 0.2736318407960199,
"acc_stderr": 0.031524391865554,
"acc_norm": 0.2736318407960199,
"acc_norm_stderr": 0.031524391865554
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.28,
"acc_stderr": 0.04512608598542128,
"acc_norm": 0.28,
"acc_norm_stderr": 0.04512608598542128
},
"hendrycksTest-virology": {
"acc": 0.30120481927710846,
"acc_stderr": 0.035716092300534796,
"acc_norm": 0.30120481927710846,
"acc_norm_stderr": 0.035716092300534796
},
"hendrycksTest-world_religions": {
"acc": 0.38596491228070173,
"acc_stderr": 0.03733756969066164,
"acc_norm": 0.38596491228070173,
"acc_norm_stderr": 0.03733756969066164
}
},
"versions": {
"hendrycksTest-abstract_algebra": 1,
"hendrycksTest-anatomy": 1,
"hendrycksTest-astronomy": 1,
"hendrycksTest-business_ethics": 1,
"hendrycksTest-clinical_knowledge": 1,
"hendrycksTest-college_biology": 1,
"hendrycksTest-college_chemistry": 1,
"hendrycksTest-college_computer_science": 1,
"hendrycksTest-college_mathematics": 1,
"hendrycksTest-college_medicine": 1,
"hendrycksTest-college_physics": 1,
"hendrycksTest-computer_security": 1,
"hendrycksTest-conceptual_physics": 1,
"hendrycksTest-econometrics": 1,
"hendrycksTest-electrical_engineering": 1,
"hendrycksTest-elementary_mathematics": 1,
"hendrycksTest-formal_logic": 1,
"hendrycksTest-global_facts": 1,
"hendrycksTest-high_school_biology": 1,
"hendrycksTest-high_school_chemistry": 1,
"hendrycksTest-high_school_computer_science": 1,
"hendrycksTest-high_school_european_history": 1,
"hendrycksTest-high_school_geography": 1,
"hendrycksTest-high_school_government_and_politics": 1,
"hendrycksTest-high_school_macroeconomics": 1,
"hendrycksTest-high_school_mathematics": 1,
"hendrycksTest-high_school_microeconomics": 1,
"hendrycksTest-high_school_physics": 1,
"hendrycksTest-high_school_psychology": 1,
"hendrycksTest-high_school_statistics": 1,
"hendrycksTest-high_school_us_history": 1,
"hendrycksTest-high_school_world_history": 1,
"hendrycksTest-human_aging": 1,
"hendrycksTest-human_sexuality": 1,
"hendrycksTest-international_law": 1,
"hendrycksTest-jurisprudence": 1,
"hendrycksTest-logical_fallacies": 1,
"hendrycksTest-machine_learning": 1,
"hendrycksTest-management": 1,
"hendrycksTest-marketing": 1,
"hendrycksTest-medical_genetics": 1,
"hendrycksTest-miscellaneous": 1,
"hendrycksTest-moral_disputes": 1,
"hendrycksTest-moral_scenarios": 1,
"hendrycksTest-nutrition": 1,
"hendrycksTest-philosophy": 1,
"hendrycksTest-prehistory": 1,
"hendrycksTest-professional_accounting": 1,
"hendrycksTest-professional_law": 1,
"hendrycksTest-professional_medicine": 1,
"hendrycksTest-professional_psychology": 1,
"hendrycksTest-public_relations": 1,
"hendrycksTest-security_studies": 1,
"hendrycksTest-sociology": 1,
"hendrycksTest-us_foreign_policy": 1,
"hendrycksTest-virology": 1,
"hendrycksTest-world_religions": 1
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=workdir_7b/ckpt_349",
"num_fewshot": 5,
"batch_size": "8",
"batch_sizes": [],
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}